57
57
delta_result create_delta_index(source_info *src,
59
delta_index **fresh) nogil
60
int max_entries) nogil
60
61
delta_result create_delta_index_from_delta(source_info *delta,
62
63
delta_index **fresh) nogil
70
71
unsigned char *top) nogil
71
72
unsigned long sizeof_delta_index(delta_index *index)
72
73
Py_ssize_t DELTA_SIZE_MIN
74
int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset)
75
int get_entry_summary(delta_index *index, int pos,
76
unsigned int *global_offset, unsigned int *hash_val)
77
unsigned int rabin_hash (unsigned char *data)
75
80
cdef void *safe_malloc(size_t count) except NULL:
113
118
return AssertionError("Unrecognised delta result code: %d" % result)
121
def _rabin_hash(content):
    """Compute the RABIN hash of a byte string and return it as an int.

    :param content: An exact str (checked with PyString_CheckExact) holding
        at least 16 bytes.
    :return: The result of the C rabin_hash() function applied to the
        buffer of content, passed through int().
    :raises ValueError: If content is not an exact str, or if it is shorter
        than 16 bytes.
    """
    cdef unsigned char *c_content

    if not PyString_CheckExact(content):
        raise ValueError('content must be a string')
    if len(content) < 16:
        raise ValueError('content must be at least 16 bytes long')
    # rabin_hash() only receives a pointer, with no length, so the size
    # check has to happen before the call (presumably it reads a fixed
    # 16-byte window -- confirm against the C implementation).
    c_content = <unsigned char*>(PyString_AS_STRING(content))
    # Go through int() so that a value which fits comes back as a plain int.
    return int(rabin_hash(c_content))
116
130
cdef class DeltaIndex:
118
132
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
123
137
cdef delta_index *_index
124
138
cdef public unsigned long _source_offset
125
139
cdef readonly unsigned int _max_num_sources
140
cdef public int _max_bytes_to_index
127
def __init__(self, source=None, max_bytes_to_index=None):
    """Create an index with no sources yet, optionally adding a first one.

    :param source: When not None, it is handed to add_source() with 0
        unadded bytes once the object has been set up.
    :param max_bytes_to_index: When not None, stored as
        _max_bytes_to_index; when None that attribute is left at 0. The
        stored value is what later gets passed to create_delta_index().
    """
    self._sources = []
    self._index = NULL
    self._source_offset = 0
    # The array of source_info records is allocated once, up front, sized
    # for the maximum number of sources.
    self._max_num_sources = 65000
    self._source_infos = <source_info *>safe_malloc(
        sizeof(source_info) * self._max_num_sources)
    if max_bytes_to_index is None:
        self._max_bytes_to_index = 0
    else:
        self._max_bytes_to_index = max_bytes_to_index
    if source is not None:
        self.add_source(source, 0)
164
182
def _has_index(self):
    """Report whether self._index currently points at a C delta index.

    :return: A true value when self._index is not NULL, a false value
        otherwise.
    """
    index_present = (self._index != NULL)
    return index_present
185
def _dump_index(self):
186
"""Dump the pointers in the index.
188
This is an arbitrary layout, used for testing. It is not meant to be
189
used in production code.
191
:return: (hash_list, entry_list)
192
hash_list A list of offsets, so hash[i] points to the 'hash
193
bucket' starting at the given offset and going until
195
entry_list A list of (text_offset, hash_val). text_offset is the
196
offset in the "source" texts, and hash_val is the RABIN
197
hash for that offset.
198
Note that the entry should be in the hash bucket
200
hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
203
cdef unsigned int text_offset
204
cdef unsigned int hash_val
205
cdef unsigned int hash_offset
206
if self._index == NULL:
210
while get_hash_offset(self._index, pos, &hash_offset):
211
hash_list.append(int(hash_offset))
215
while get_entry_summary(self._index, pos, &text_offset, &hash_val):
216
# Map back using 'int' so that we don't get Long everywhere, when
217
# almost everything is <2**31.
218
val = tuple(map(int, [text_offset, hash_val]))
219
entry_list.append(val)
221
return hash_list, entry_list
167
223
def add_delta_source(self, delta, unadded_bytes):
168
224
"""Add a new delta to the source texts.
207
263
:param source: The text in question, this must be a byte string
208
264
:param unadded_bytes: Assume there are this many bytes that didn't get
209
265
added between this source and the end of the previous source.
266
:param max_pointers: Add no more than this many entries to the index.
267
By default, we sample every 16 bytes, if that would require more
268
than max_pointers, we will reduce the sampling rate.
269
A value of 0 means unlimited, None means use the default limit.
211
271
cdef char *c_source
212
272
cdef Py_ssize_t c_source_size
237
298
# We delay creating the index on the first insert
238
299
if source_location != 0:
240
res = create_delta_index(src, self._index, &index)
301
res = create_delta_index(src, self._index, &index,
302
self._max_bytes_to_index)
241
303
if res != DELTA_OK:
242
304
raise _translate_delta_failure(res)
243
305
if index != self._index:
254
316
# We know that self._index is already NULL, so create_delta_index
255
317
# will always create a new index unless there's a malloc failure
257
res = create_delta_index(&self._source_infos[0], NULL, &index)
319
res = create_delta_index(&self._source_infos[0], NULL, &index,
320
self._max_bytes_to_index)
258
321
if res != DELTA_OK:
259
322
raise _translate_delta_failure(res)
260
323
self._index = index