34
32
cdef extern from *:
35
33
ctypedef unsigned long size_t
36
void * malloc(size_t) nogil
37
void * realloc(void *, size_t) nogil
38
void free(void *) nogil
39
void memcpy(void *, void *, size_t) nogil
35
void * realloc(void *, size_t)
37
void memcpy(void *, void *, size_t)
42
40
cdef extern from "delta.h":
46
44
unsigned long agg_offset
47
45
struct delta_index:
49
ctypedef enum delta_result:
57
delta_result create_delta_index(source_info *src,
60
int max_entries) nogil
61
delta_result create_delta_index_from_delta(source_info *delta,
63
delta_index **fresh) nogil
64
void free_delta_index(delta_index *index) nogil
65
delta_result create_delta(delta_index *indexes,
66
void *buf, unsigned long bufsize,
67
unsigned long *delta_size,
68
unsigned long max_delta_size,
69
void **delta_data) nogil
47
delta_index * create_delta_index(source_info *src, delta_index *old)
48
delta_index * create_delta_index_from_delta(source_info *delta,
50
void free_delta_index(delta_index *index)
51
void *create_delta(delta_index *indexes,
52
void *buf, unsigned long bufsize,
53
unsigned long *delta_size, unsigned long max_delta_size)
70
54
unsigned long get_delta_hdr_size(unsigned char **datap,
71
unsigned char *top) nogil
72
unsigned long sizeof_delta_index(delta_index *index)
73
56
Py_ssize_t DELTA_SIZE_MIN
74
int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset)
75
int get_entry_summary(delta_index *index, int pos,
76
unsigned int *global_offset, unsigned int *hash_val)
77
unsigned int rabin_hash (unsigned char *data)
57
void *patch_delta(void *src_buf, unsigned long src_size,
58
void *delta_buf, unsigned long delta_size,
59
unsigned long *dst_size)
80
62
cdef void *safe_malloc(size_t count) except NULL:
104
86
return DeltaIndex(source)
107
cdef object _translate_delta_failure(delta_result result):
108
if result == DELTA_OUT_OF_MEMORY:
109
return MemoryError("Delta function failed to allocate memory")
110
elif result == DELTA_INDEX_NEEDED:
111
return ValueError("Delta function requires delta_index param")
112
elif result == DELTA_SOURCE_EMPTY:
113
return ValueError("Delta function given empty source_info param")
114
elif result == DELTA_SOURCE_BAD:
115
return RuntimeError("Delta function given invalid source_info param")
116
elif result == DELTA_BUFFER_EMPTY:
117
return ValueError("Delta function given empty buffer params")
118
return AssertionError("Unrecognised delta result code: %d" % result)
121
def _rabin_hash(content):
122
if not PyString_CheckExact(content):
123
raise ValueError('content must be a string')
124
if len(content) < 16:
125
raise ValueError('content must be at least 16 bytes long')
126
# Try to cast it to an int, if it can fit
127
return int(rabin_hash(<unsigned char*>(PyString_AS_STRING(content))))
130
89
cdef class DeltaIndex:
132
91
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
135
94
cdef readonly object _sources
136
95
cdef source_info *_source_infos
137
96
cdef delta_index *_index
97
cdef readonly unsigned int _max_num_sources
138
98
cdef public unsigned long _source_offset
139
cdef readonly unsigned int _max_num_sources
140
cdef public int _max_bytes_to_index
142
def __init__(self, source=None, max_bytes_to_index=None):
100
def __init__(self, source=None):
143
101
self._sources = []
144
102
self._index = NULL
145
103
self._max_num_sources = 65000
146
104
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
147
105
* self._max_num_sources)
148
106
self._source_offset = 0
149
self._max_bytes_to_index = 0
150
if max_bytes_to_index is not None:
151
self._max_bytes_to_index = max_bytes_to_index
153
108
if source is not None:
154
109
self.add_source(source, 0)
156
def __sizeof__(self):
157
# We want to track the _source_infos allocations, but the referenced
158
# void* are actually tracked in _sources itself.
159
# XXX: Cython is capable of doing sizeof(class) and returning the size
160
# of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
161
# to do it manually. *sigh* Note that we might get it wrong
162
# because of alignment issues.
164
# PyObject start, vtable *, 3 object pointers, 2 C ints
165
size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
166
+ sizeof(unsigned long)
167
+ sizeof(unsigned int))
168
+ (sizeof(source_info) * self._max_num_sources)
169
+ sizeof_delta_index(self._index))
172
111
def __repr__(self):
173
112
return '%s(%d, %d)' % (self.__class__.__name__,
174
113
len(self._sources), self._source_offset)
182
121
def _has_index(self):
183
122
return (self._index != NULL)
185
def _dump_index(self):
186
"""Dump the pointers in the index.
188
This is an arbitrary layout, used for testing. It is not meant to be
189
used in production code.
191
:return: (hash_list, entry_list)
192
hash_list A list of offsets, so hash[i] points to the 'hash
193
bucket' starting at the given offset and going until
195
entry_list A list of (text_offset, hash_val). text_offset is the
196
offset in the "source" texts, and hash_val is the RABIN
197
hash for that offset.
198
Note that the entry should be in the hash bucket
200
hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
203
cdef unsigned int text_offset
204
cdef unsigned int hash_val
205
cdef unsigned int hash_offset
206
if self._index == NULL:
210
while get_hash_offset(self._index, pos, &hash_offset):
211
hash_list.append(int(hash_offset))
215
while get_entry_summary(self._index, pos, &text_offset, &hash_val):
216
# Map back using 'int' so that we don't get Long everywhere, when
217
# almost everything is <2**31.
218
val = tuple(map(int, [text_offset, hash_val]))
219
entry_list.append(val)
221
return hash_list, entry_list
223
124
def add_delta_source(self, delta, unadded_bytes):
224
125
"""Add a new delta to the source texts.
248
148
src.buf = c_delta
249
149
src.size = c_delta_size
250
150
src.agg_offset = self._source_offset + unadded_bytes
252
res = create_delta_index_from_delta(src, self._index, &index)
254
raise _translate_delta_failure(res)
151
index = create_delta_index_from_delta(src, self._index)
255
152
self._source_offset = src.agg_offset + src.size
256
if index != self._index:
257
154
free_delta_index(self._index)
258
155
self._index = index
263
160
:param source: The text in question, this must be a byte string
264
161
:param unadded_bytes: Assume there are this many bytes that didn't get
265
162
added between this source and the end of the previous source.
266
:param max_pointers: Add no more than this many entries to the index.
267
By default, we sample every 16 bytes, if that would require more
268
than max_entries, we will reduce the sampling rate.
269
A value of 0 means unlimited, None means use the default limit.
271
164
cdef char *c_source
272
165
cdef Py_ssize_t c_source_size
273
166
cdef delta_index *index
274
cdef delta_result res
275
167
cdef unsigned int source_location
276
168
cdef source_info *src
277
169
cdef unsigned int num_indexes
278
cdef int max_num_entries
280
171
if not PyString_CheckExact(source):
281
172
raise TypeError('source is not a str')
297
188
self._source_offset = src.agg_offset + src.size
298
189
# We delay creating the index on the first insert
299
190
if source_location != 0:
301
res = create_delta_index(src, self._index, &index,
302
self._max_bytes_to_index)
304
raise _translate_delta_failure(res)
305
if index != self._index:
191
index = create_delta_index(src, self._index)
306
193
free_delta_index(self._index)
307
194
self._index = index
309
196
cdef _populate_first_index(self):
310
197
cdef delta_index *index
311
cdef delta_result res
312
198
if len(self._sources) != 1 or self._index != NULL:
313
199
raise AssertionError('_populate_first_index should only be'
314
200
' called when we have a single source and no index yet')
316
# We know that self._index is already NULL, so create_delta_index
317
# will always create a new index unless there's a malloc failure
319
res = create_delta_index(&self._source_infos[0], NULL, &index,
320
self._max_bytes_to_index)
322
raise _translate_delta_failure(res)
202
# We know that self._index is already NULL, so whatever
203
# create_delta_index returns is fine
204
self._index = create_delta_index(&self._source_infos[0], NULL)
205
assert self._index != NULL
325
207
cdef _expand_sources(self):
326
208
raise RuntimeError('if we move self._source_infos, then we need to'
354
234
# TODO: inline some of create_delta so we at least don't have to double
355
235
# malloc, and can instead use PyString_FromStringAndSize, to
356
236
# allocate the bytes into the final string
357
c_max_delta_size = max_delta_size
359
res = create_delta(self._index, target, target_size,
360
&delta_size, c_max_delta_size, &delta)
237
delta = create_delta(self._index,
239
&delta_size, max_delta_size)
363
242
result = PyString_FromStringAndSize(<char *>delta, delta_size)
365
elif res != DELTA_SIZE_TOO_BIG:
366
raise _translate_delta_failure(res)
401
278
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
402
unsigned char cmd, unsigned int *offset,
403
unsigned int *length) nogil: # cannot_raise
279
unsigned char cmd, unsigned int *offset, unsigned int *length):
404
280
"""Decode a copy instruction from the next few bytes.
406
282
A copy instruction is a variable number of bytes, so we will parse the
460
335
result = PyString_FromStringAndSize(NULL, size)
461
336
dst_buf = <unsigned char*>PyString_AS_STRING(result)
471
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
472
if (cp_off + cp_size < cp_size or
473
cp_off + cp_size > <unsigned int>source_size or
474
cp_size > <unsigned int>size):
477
memcpy(out, source + cp_off, cp_size)
479
size = size - cp_size
483
# cmd == 0 is reserved for future encoding
484
# extensions. In the mean time we must fail when
485
# encountering them (might be data corruption).
491
memcpy(out, data, cmd)
497
raise ValueError('Something wrong with:'
498
' cp_off = %s, cp_size = %s'
499
' source_size = %s, size = %s'
500
% (cp_off, cp_size, source_size, size))
502
raise ValueError('Got delta opcode: 0, not supported')
504
raise ValueError('Insert instruction longer than remaining'
505
' bytes: %d > %d' % (cmd, size))
344
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
345
if (cp_off + cp_size < cp_size or
346
cp_off + cp_size > source_size or
348
raise RuntimeError('Something wrong with:'
349
' cp_off = %s, cp_size = %s'
350
' source_size = %s, size = %s'
351
% (cp_off, cp_size, source_size, size))
352
memcpy(out, source + cp_off, cp_size)
354
size = size - cp_size
358
# cmd == 0 is reserved for future encoding
359
# extensions. In the mean time we must fail when
360
# encountering them (might be data corruption).
361
raise RuntimeError('Got delta opcode: 0, not supported')
363
raise RuntimeError('Insert instruction longer than remaining'
364
' bytes: %d > %d' % (cmd, size))
365
memcpy(out, data, cmd)
508
371
if (data != top or size != 0):