85
def make_delta_index(source):
86
return DeltaIndex(source)
89
cdef class DeltaIndex:
91
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
92
# isn't performance critical
93
# cdef readonly list _sources
94
cdef readonly object _sources
95
cdef source_info *_source_infos
96
cdef delta_index *_index
97
cdef public unsigned long _source_offset
98
cdef readonly unsigned int _max_num_sources
100
def __init__(self, source=None):
103
self._max_num_sources = 65000
104
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
105
* self._max_num_sources)
106
self._source_offset = 0
108
if source is not None:
109
self.add_source(source, 0)
111
def __sizeof__(self):
112
# We want to track the _source_infos allocations, but the referenced
113
# void* are actually tracked in _sources itself.
114
# XXX: Cython is capable of doing sizeof(class) and returning the size
115
# of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
116
# to do it manually. *sigh* Note that we might get it wrong
117
# because of alignment issues.
119
# PyObject start, vtable *, 3 object pointers, 2 C ints
120
size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
121
+ sizeof(unsigned long)
122
+ sizeof(unsigned int))
123
+ (sizeof(source_info) * self._max_num_sources)
124
+ sizeof_delta_index(self._index))
128
return '%s(%d, %d)' % (self.__class__.__name__,
129
len(self._sources), self._source_offset)
86
cdef class EquivalenceTable:
87
"""This tracks equivalencies between lists of hashable objects.
89
:ivar lines: The 'static' lines that will be preserved between runs.
92
cdef readonly object lines
93
cdef readonly object _right_lines
94
cdef Py_ssize_t _hashtable_size
95
cdef Py_ssize_t _hashtable_bitmask
96
cdef _hash_bucket *_hashtable
97
cdef _raw_line *_raw_lines
98
cdef Py_ssize_t _len_lines
100
def __init__(self, lines):
101
self.lines = list(lines)
102
self._right_lines = None
103
self._hashtable_size = 0
104
self._hashtable = NULL
105
self._raw_lines = NULL
107
self._lines_to_raw_lines(lines)
108
self._build_hash_table()
131
110
def __dealloc__(self):
132
if self._index != NULL:
133
free_delta_index(self._index)
135
safe_free(<void **>&self._source_infos)
137
def _has_index(self):
138
return (self._index != NULL)
140
def add_delta_source(self, delta, unadded_bytes):
141
"""Add a new delta to the source texts.
143
:param delta: The text of the delta, this must be a byte string.
144
:param unadded_bytes: Number of bytes that were added to the source
145
that were not indexed.
148
cdef Py_ssize_t c_delta_size
149
cdef delta_index *index
150
cdef unsigned int source_location
151
cdef source_info *src
152
cdef unsigned int num_indexes
154
if not PyString_CheckExact(delta):
155
raise TypeError('delta is not a str')
157
source_location = len(self._sources)
158
if source_location >= self._max_num_sources:
159
self._expand_sources()
160
self._sources.append(delta)
161
c_delta = PyString_AS_STRING(delta)
162
c_delta_size = PyString_GET_SIZE(delta)
163
src = self._source_infos + source_location
165
src.size = c_delta_size
166
src.agg_offset = self._source_offset + unadded_bytes
168
index = create_delta_index_from_delta(src, self._index)
169
self._source_offset = src.agg_offset + src.size
171
free_delta_index(self._index)
174
def add_source(self, source, unadded_bytes):
175
"""Add a new bit of source text to the delta indexes.
177
:param source: The text in question, this must be a byte string
178
:param unadded_bytes: Assume there are this many bytes that didn't get
179
added between this source and the end of the previous source.
182
cdef Py_ssize_t c_source_size
183
cdef delta_index *index
184
cdef unsigned int source_location
185
cdef source_info *src
186
cdef unsigned int num_indexes
188
if not PyString_CheckExact(source):
189
raise TypeError('source is not a str')
191
source_location = len(self._sources)
192
if source_location >= self._max_num_sources:
193
self._expand_sources()
194
if source_location != 0 and self._index == NULL:
195
# We were lazy about populating the index, create it now
196
self._populate_first_index()
197
self._sources.append(source)
198
c_source = PyString_AS_STRING(source)
199
c_source_size = PyString_GET_SIZE(source)
200
src = self._source_infos + source_location
202
src.size = c_source_size
204
src.agg_offset = self._source_offset + unadded_bytes
205
self._source_offset = src.agg_offset + src.size
206
# We delay creating the index on the first insert
207
if source_location != 0:
209
index = create_delta_index(src, self._index)
211
free_delta_index(self._index)
214
cdef _populate_first_index(self):
215
cdef delta_index *index
216
if len(self._sources) != 1 or self._index != NULL:
217
raise AssertionError('_populate_first_index should only be'
218
' called when we have a single source and no index yet')
220
# We know that self._index is already NULL, so whatever
221
# create_delta_index returns is fine
223
self._index = create_delta_index(&self._source_infos[0], NULL)
224
assert self._index != NULL
226
cdef _expand_sources(self):
227
raise RuntimeError('if we move self._source_infos, then we need to'
228
' change all of the index pointers as well.')
229
self._max_num_sources = self._max_num_sources * 2
230
self._source_infos = <source_info *>safe_realloc(self._source_infos,
232
* self._max_num_sources)
234
def make_delta(self, target_bytes, max_delta_size=0):
235
"""Create a delta from the current source to the target bytes."""
237
cdef Py_ssize_t target_size
239
cdef unsigned long delta_size
240
cdef unsigned long c_max_delta_size
242
if self._index == NULL:
243
if len(self._sources) == 0:
111
safe_free(<void**>&self._hashtable)
112
safe_free(<void**>&self._raw_lines)
114
cdef int _line_to_raw_line(self, PyObject *line, _raw_line *raw_line) except -1:
115
"""Convert a single PyObject into a raw line."""
116
raw_line.hash = PyObject_Hash(line)
117
raw_line.next_line_index = SENTINEL
118
raw_line.hash_offset = SENTINEL
119
raw_line.flags = INDEXED
122
cdef int _lines_to_raw_lines(self, object lines) except -1:
123
"""Load a sequence of objects into the _raw_line format"""
124
cdef Py_ssize_t count, i
129
# Do we want to use PySequence_Fast, or just assume that it is a list
130
# Also, do we need to decref the return value?
131
# http://www.python.org/doc/current/api/sequence.html
132
seq = PySequence_Fast(lines, "expected a sequence")
134
count = PySequence_Fast_GET_SIZE(seq)
137
raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line))
138
safe_free(<void**>&self._raw_lines)
139
self._raw_lines = raw
140
self._len_lines = count
142
for i from 0 <= i < count:
143
item = PySequence_Fast_GET_ITEM(seq, i)
144
# NB: We don't Py_INCREF the data pointer, because we know we
145
# maintain a pointer to the item in self.lines or
147
# TODO: Do we even need to track a data pointer here? It would
148
# be less memory, and we *could* just look it up in the
149
# appropriate line list.
150
self._line_to_raw_line(item, &raw[i])
152
# TODO: Unfortunately, try/finally always generates a compiler
153
# warning about a possibly unused variable :(
157
cdef Py_ssize_t _compute_minimum_hash_size(self, Py_ssize_t needed):
158
"""Determine the smallest hash size that can reasonably fit the data.
160
:param needed: The number of entries we might be inserting into
161
the hash (assuming there are no duplicate lines.)
162
:return: The number of hash table entries to use. This will always be a
165
cdef Py_ssize_t hash_size
166
cdef Py_ssize_t min_size
168
# TODO: Another alternative would be to actually count how full the
169
# hash-table is, and decide if we need to grow it based on
170
# density. That can take into account duplicated lines. Though if
171
# we compress well, there should be a minimal amount of
172
# duplicated lines in the output.
174
# At the bare minimum, we could fit all entries into a 'needed'
175
# size hash table. However, any collision would then have a long way to
176
# traverse before it could find a 'free' slot.
177
# So we set the minimum size to give us 33% empty slots.
178
min_size = <Py_ssize_t>(needed * 2)
180
while hash_size < min_size:
181
hash_size = hash_size << 1
184
def _py_compute_minimum_hash_size(self, needed):
185
"""Expose _compute_minimum_hash_size to python for testing."""
186
return self._compute_minimum_hash_size(needed)
188
cdef Py_ssize_t _compute_recommended_hash_size(self, Py_ssize_t needed):
189
"""Determine a reasonable hash size, assuming some room for growth.
191
:param needed: The number of entries we might be inserting into
192
the hash (assuming there are no duplicate lines.)
193
:return: The number of hash table entries to use. This will always be a
196
cdef Py_ssize_t hash_size
197
cdef Py_ssize_t min_size
199
# We start off with a 8k hash (after doubling), because there isn't a
200
# lot of reason to go smaller than that (this class isn't one you'll be
201
# instantiating thousands of, and you are always likely to grow here.)
203
while hash_size < needed:
204
hash_size = hash_size << 1
205
# And we always give at least 4x blank space
206
hash_size = hash_size << 2
209
def _py_compute_recommended_hash_size(self, needed):
210
"""Expose _compute_recommended_hash_size to python for testing."""
211
return self._compute_recommended_hash_size(needed)
213
cdef int _build_hash_table(self) except -1:
214
"""Build the hash table 'from scratch'."""
215
cdef Py_ssize_t hash_size
216
cdef Py_ssize_t hash_bitmask
218
cdef _raw_line *cur_line
219
cdef Py_ssize_t hash_offset
220
cdef _hash_bucket *cur_bucket
221
cdef _hash_bucket *new_hashtable
223
# Hash size is a power of 2
224
hash_size = self._compute_recommended_hash_size(self._len_lines)
226
new_hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) *
228
safe_free(<void**>&self._hashtable)
229
self._hashtable = new_hashtable
231
self._hashtable_size = hash_size
232
for i from 0 <= i < hash_size:
233
self._hashtable[i].line_index = SENTINEL
234
self._hashtable[i].count = 0
236
# Turn the hash size into a bitmask
237
self._hashtable_bitmask = hash_size - 1
239
# Iterate backwards, because it makes it easier to insert items into
240
# the hash (you just change the head pointer, and everything else keeps
241
# pointing to the same location).
242
for i from self._len_lines > i >= 0:
243
cur_line = self._raw_lines + i
244
if not (cur_line.flags & INDEXED == INDEXED):
246
hash_offset = self._find_hash_position(cur_line)
248
# Point this line to the location in the hash table
249
cur_line.hash_offset = hash_offset
250
# And make this line the head of the hash table
251
cur_bucket = self._hashtable + hash_offset
252
cur_line.next_line_index = cur_bucket.line_index
253
cur_bucket.line_index = i
254
cur_bucket.count = cur_bucket.count + 1
256
cdef int _extend_raw_lines(self, object index) except -1:
257
"""Add the last N lines from self.lines into the raw_lines array."""
258
cdef Py_ssize_t new_count
259
cdef Py_ssize_t new_total_len
260
cdef Py_ssize_t old_len
262
cdef PyObject *should_index
264
cdef Py_ssize_t line_index
265
cdef _raw_line *cur_line
266
cdef PyObject *seq_index
267
cdef Py_ssize_t actual_new_len
269
seq_index = PySequence_Fast(index, "expected a sequence for index")
271
old_len = self._len_lines
272
new_count = PySequence_Fast_GET_SIZE(seq_index)
273
new_total_len = new_count + self._len_lines
275
while actual_new_len < new_total_len:
276
actual_new_len = actual_new_len << 1
277
self._raw_lines = <_raw_line*>safe_realloc(<void*>self._raw_lines,
278
actual_new_len * sizeof(_raw_line))
279
self._len_lines = new_total_len
280
# Now that we have enough space, start adding the new lines
281
# into the array. These are done in forward order.
282
for i from 0 <= i < new_count:
283
line_index = i + old_len
284
cur_line = self._raw_lines + line_index
285
item = PyList_GET_ITEM(self.lines, line_index)
286
self._line_to_raw_line(item, cur_line)
287
should_index = PySequence_Fast_GET_ITEM(seq_index, i)
288
if PyObject_Not(should_index):
289
cur_line.flags = cur_line.flags & ~(<int>INDEXED)
293
cdef int _extend_hash_table(self, Py_ssize_t count) except -1:
294
"""Add the last N entries in self.lines to the hash table.
296
:param index: A sequence that declares whether each node should be
299
cdef Py_ssize_t line_index
300
cdef Py_ssize_t start
301
cdef _raw_line *cur_line
302
cdef _raw_line *next_line
303
cdef Py_ssize_t hash_offset
304
cdef _hash_bucket *cur_bucket
306
start = self._len_lines - count
308
for line_index from start <= line_index < self._len_lines:
309
cur_line = self._raw_lines + line_index
310
if cur_line.flags & INDEXED != INDEXED:
312
hash_offset = self._find_hash_position(cur_line)
314
# Point this line to the location in the hash table
315
cur_line.hash_offset = hash_offset
317
# Make this line the tail of the hash table
318
cur_bucket = self._hashtable + hash_offset
319
cur_bucket.count = cur_bucket.count + 1
320
if cur_bucket.line_index == SENTINEL:
321
cur_bucket.line_index = line_index
323
# We need to track through the pointers and insert this at
325
next_line = self._raw_lines + cur_bucket.line_index
326
while next_line.next_line_index != SENTINEL:
327
next_line = self._raw_lines + next_line.next_line_index
328
next_line.next_line_index = line_index
330
cdef Py_ssize_t _find_hash_position(self, _raw_line *line) except -1:
331
"""Find the node in the hash which defines this line.
333
Each bucket in the hash table points at exactly 1 equivalent line. If 2
334
objects would collide, we just increment to the next bucket until we
335
get to an empty bucket that is either empty or exactly matches this
338
:return: The offset in the hash table for this entry
340
cdef Py_ssize_t location
341
cdef _raw_line *ref_line
342
cdef Py_ssize_t ref_index
343
cdef int compare_result
345
location = line.hash & self._hashtable_bitmask
346
ref_index = self._hashtable[location].line_index
347
while ref_index != SENTINEL:
348
ref_line = self._raw_lines + ref_index
349
if (ref_line.hash == line.hash):
350
PyObject_Cmp(ref_line.data, line.data, &compare_result)
351
if compare_result == 0:
353
location = (location + 1) & self._hashtable_bitmask
354
ref_index = self._hashtable[location].line_index
357
def _py_find_hash_position(self, line):
358
"""Used only for testing.
360
Return the location where this fits in the hash table
362
cdef _raw_line raw_line
364
self._line_to_raw_line(<PyObject *>line, &raw_line)
365
return self._find_hash_position(&raw_line)
367
def _inspect_left_lines(self):
368
"""Used only for testing.
370
:return: None if _raw_lines is NULL,
371
else [(object, hash_val, hash_loc, next_val)] for each node in raw
376
if self._raw_lines == NULL:
380
for i from 0 <= i < self._len_lines:
381
PyList_Append(result,
382
(<object>self._raw_lines[i].data,
383
self._raw_lines[i].hash,
384
self._raw_lines[i].hash_offset,
385
self._raw_lines[i].next_line_index,
389
def _inspect_hash_table(self):
390
"""Used only for testing.
392
This iterates the hash table, and returns 'interesting' entries.
393
:return: (total_size, [(offset, line_index, count)] for entries that
399
for i from 0 <= i < self._hashtable_size:
400
if self._hashtable[i].line_index != SENTINEL:
401
PyList_Append(interesting,
402
(i, self._hashtable[i].line_index,
403
self._hashtable[i].count))
404
return (self._hashtable_size, interesting)
406
def get_matches(self, line):
407
"""Return the lines which match the line in right."""
408
cdef Py_ssize_t count
410
cdef Py_ssize_t *matches
415
count = self.get_raw_matches(<PyObject *>line, &matches)
245
# We were just lazy about generating the index
246
self._populate_first_index()
248
if not PyString_CheckExact(target_bytes):
249
raise TypeError('target is not a str')
251
target = PyString_AS_STRING(target_bytes)
252
target_size = PyString_GET_SIZE(target_bytes)
254
# TODO: inline some of create_delta so we at least don't have to double
255
# malloc, and can instead use PyString_FromStringAndSize, to
256
# allocate the bytes into the final string
257
c_max_delta_size = max_delta_size
259
delta = create_delta(self._index,
261
&delta_size, c_max_delta_size)
264
result = PyString_FromStringAndSize(<char *>delta, delta_size)
419
for i from 0 <= i < count:
420
PyList_Append(result, matches[i])
422
safe_free(<void**>&matches)
269
def make_delta(source_bytes, target_bytes):
270
"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""
271
di = DeltaIndex(source_bytes)
272
return di.make_delta(target_bytes)
275
def apply_delta(source_bytes, delta_bytes):
276
"""Apply a delta generated by make_delta to source_bytes."""
278
cdef Py_ssize_t source_size
280
cdef Py_ssize_t delta_size
282
if not PyString_CheckExact(source_bytes):
283
raise TypeError('source is not a str')
284
if not PyString_CheckExact(delta_bytes):
285
raise TypeError('delta is not a str')
286
source = PyString_AS_STRING(source_bytes)
287
source_size = PyString_GET_SIZE(source_bytes)
288
delta = PyString_AS_STRING(delta_bytes)
289
delta_size = PyString_GET_SIZE(delta_bytes)
290
# Code taken from patch-delta.c, only brought here to give better error
291
# handling, and to avoid double allocating memory
292
if (delta_size < DELTA_SIZE_MIN):
293
# XXX: Invalid delta block
294
raise RuntimeError('delta_size %d smaller than min delta size %d'
295
% (delta_size, DELTA_SIZE_MIN))
297
return _apply_delta(source, source_size, delta, delta_size)
300
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
301
unsigned char cmd, unsigned int *offset,
302
unsigned int *length) nogil: # cannot_raise
303
"""Decode a copy instruction from the next few bytes.
305
A copy instruction is a variable number of bytes, so we will parse the
306
bytes we care about, and return the new position, as well as the offset and
307
length referred to in the bytes.
309
:param bytes: Pointer to the start of bytes after cmd
310
:param cmd: The command code
311
:return: Pointer to the bytes just after the last decode byte
425
cdef Py_ssize_t get_raw_matches(self, PyObject *line,
426
Py_ssize_t **matches) except -1:
427
"""Get all the possible entries that match the given location.
429
:param line: A Python line which we want to match
430
:param pos: The index position in _right_lines
431
:param matches: A variable to hold an array containing all the matches.
432
This will be allocated using malloc, and the caller is responsible
433
to free it with free(). If there are no matches, no memory will be
435
:retrun: The number of matched positions
437
cdef Py_ssize_t hash_offset
438
cdef _raw_line raw_line
439
cdef _hash_bucket cur_bucket
440
cdef Py_ssize_t cur_line_idx
441
cdef Py_ssize_t count
442
cdef Py_ssize_t *local_matches
444
self._line_to_raw_line(line, &raw_line)
445
hash_offset = self._find_hash_position(&raw_line)
446
cur_bucket = self._hashtable[hash_offset]
447
cur_line_idx = cur_bucket.line_index
448
if cur_line_idx == SENTINEL:
450
local_matches = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) *
451
(cur_bucket.count + 1))
452
matches[0] = local_matches
453
for count from 0 <= count < cur_bucket.count:
454
assert cur_line_idx != SENTINEL
455
local_matches[count] = cur_line_idx
456
cur_line_idx = self._raw_lines[cur_line_idx].next_line_index
457
return cur_bucket.count
459
cdef Py_ssize_t get_next_matches(self, PyObject *line,
461
Py_ssize_t tip_count,
462
int *did_match) except -1:
463
"""Find matches that are interesting for the next step.
465
This finds offsets which match the current search tips. It is slightly
466
different from get_raw_matches, in that it intersects matching lines
467
against the existing tips, and then returns the *next* line.
469
TODO: It might be nice to know if there were 0 matches because the line
470
didn't match anything, or if it was just because it didn't follow
473
:param line: A line to match against
474
:param tips: A list of matches that we already have. This list
475
will be updated "in place". If there are no new matches, the list
476
will go unmodified, and the returned count will be 0.
477
:param tip_count: The current length of tips
478
:param did_match: Will be set to True if the line had matches, else 0
479
:return: The new number of matched lines.
481
cdef Py_ssize_t hash_offset
482
cdef _raw_line raw_line
483
cdef _hash_bucket cur_bucket
484
cdef Py_ssize_t cur_line_idx
485
cdef Py_ssize_t count
486
cdef Py_ssize_t *cur_tip
487
cdef Py_ssize_t *end_tip
488
cdef Py_ssize_t *cur_out
490
self._line_to_raw_line(line, &raw_line)
491
hash_offset = self._find_hash_position(&raw_line)
492
cur_bucket = self._hashtable[hash_offset]
493
cur_line_idx = cur_bucket.line_index
494
if cur_line_idx == SENTINEL:
501
end_tip = tips + tip_count
504
while cur_tip < end_tip and cur_line_idx != SENTINEL:
505
if cur_line_idx == cur_tip[0]:
506
# These match, so put a new entry in the output
507
# We put the *next* line, so that this is ready to pass back in
508
# for the next search.
509
cur_out[0] = (cur_line_idx + 1)
511
cur_out = cur_out + 1
512
cur_tip = cur_tip + 1
513
cur_line_idx = self._raw_lines[cur_line_idx].next_line_index
514
elif cur_line_idx < cur_tip[0]:
515
# cur_line_idx is "behind"
516
cur_line_idx = self._raw_lines[cur_line_idx].next_line_index
518
cur_tip = cur_tip + 1
521
def _get_matching_lines(self):
522
"""Return a dictionary showing matching lines."""
524
for line in self.lines:
525
matching[line] = self.get_matches(line)
528
def get_idx_matches(self, right_idx):
529
"""Return the left lines matching the right line at the given offset."""
530
line = self._right_lines[right_idx]
531
return self.get_matches(line)
533
def extend_lines(self, lines, index):
534
"""Add more lines to the left-lines list.
536
:param lines: A list of lines to add
537
:param index: A True/False for each node to define if it should be
540
cdef Py_ssize_t orig_len
541
cdef Py_ssize_t min_new_hash_size
542
assert len(lines) == len(index)
543
min_new_hash_size = self._compute_minimum_hash_size(len(self.lines) +
545
orig_len = len(self.lines)
546
self.lines.extend(lines)
547
self._extend_raw_lines(index)
548
if self._hashtable_size >= min_new_hash_size:
549
# Just add the new lines, don't bother resizing the hash table
550
self._extend_hash_table(len(index))
552
self._build_hash_table()
554
def set_right_lines(self, lines):
555
"""Set the lines we will be matching against."""
556
self._right_lines = lines
559
cdef Py_ssize_t intersect_sorted(Py_ssize_t *base, Py_ssize_t base_count,
560
Py_ssize_t *new_values, Py_ssize_t new_count) except -1:
561
"""Intersect the base values with the new values.
563
It is assumed that both base and new_values are sorted in ascending order,
566
The values in 'base' will be modified to only contain the entries that are
567
in both base and new. If nothing matches, 'base' will not be modified
569
:param base: An array of base values
570
:param base_count: The length of the base array
571
:param new_values: An array of values to compare to base, will be modified
572
to only contain matching entries
573
:param new_count: The number of new entries
574
:return: The count of matching values
313
cdef unsigned int off, size, count
576
cdef Py_ssize_t *cur_base
577
cdef Py_ssize_t *cur_new
578
cdef Py_ssize_t *cur_out
579
cdef Py_ssize_t count
580
cdef Py_ssize_t *base_end
581
cdef Py_ssize_t *new_end
586
base_end = base + base_count
587
new_end = new_values + new_count
321
off = off | (bytes[count] << 8)
324
off = off | (bytes[count] << 16)
327
off = off | (bytes[count] << 24)
333
size = size | (bytes[count] << 8)
336
size = size | (bytes[count] << 16)
345
cdef object _apply_delta(char *source, Py_ssize_t source_size,
346
char *delta, Py_ssize_t delta_size):
347
"""common functionality between apply_delta and apply_delta_to_source."""
348
cdef unsigned char *data, *top
349
cdef unsigned char *dst_buf, *out, cmd
590
while cur_base < base_end and cur_new < new_end:
591
if cur_base[0] == cur_new[0]:
592
# This may be assigning to self.
593
cur_out[0] = cur_base[0]
595
cur_out = cur_out + 1
596
# We matched, so move both pointers
597
cur_base = cur_base + 1
598
cur_new = cur_new + 1
599
elif cur_base[0] < cur_new[0]:
600
# cur_base is "behind" so increment it
601
cur_base = cur_base + 1
603
# cur_new must be "behind" move its pointer
604
cur_new = cur_new + 1
608
cdef object array_as_list(Py_ssize_t *array, Py_ssize_t count):
611
for i from 0 <= i < count:
612
PyList_Append(lst, array[i])
616
cdef Py_ssize_t list_as_array(object lst, Py_ssize_t **array) except -1:
617
"""Convert a Python sequence to an integer array.
619
:return: The size of the output, -1 on error
350
623
cdef Py_ssize_t size
351
cdef unsigned int cp_off, cp_size
354
data = <unsigned char *>delta
355
top = data + delta_size
357
# now the result size
358
size = get_delta_hdr_size(&data, top)
359
result = PyString_FromStringAndSize(NULL, size)
360
dst_buf = <unsigned char*>PyString_AS_STRING(result)
370
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
371
if (cp_off + cp_size < cp_size or
372
cp_off + cp_size > source_size or
376
memcpy(out, source + cp_off, cp_size)
378
size = size - cp_size
625
seq = PySequence_Fast(lst, "expected a sequence for lst")
627
size = PySequence_Fast_GET_SIZE(seq)
628
array[0] = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) * size)
629
for i from 0 <= i < size:
630
array[0][i] = <object>PySequence_Fast_GET_ITEM(seq, i)
636
def _get_longest_match(equivalence_table, pos, max_pos, locations):
637
"""Get the longest possible match for the current position."""
638
cdef EquivalenceTable eq
640
cdef Py_ssize_t cmax_pos
641
cdef Py_ssize_t *matches
642
cdef Py_ssize_t match_count
644
cdef Py_ssize_t *copy_ends
645
cdef Py_ssize_t copy_count
646
cdef Py_ssize_t range_len
647
cdef Py_ssize_t in_start
649
cdef Py_ssize_t intersect_count
652
eq = equivalence_table
662
# TODO: Handle when locations is not None
663
# if locations is not None:
664
# copy_count = list_as_array(locations, ©_ends)
665
# # We know the match for this position
668
while cpos < cmax_pos:
669
# TODO: Instead of grabbing the full list of matches and then doing an
670
# intersection, use a helper that does the intersection
672
line = PyList_GET_ITEM(eq._right_lines, cpos)
673
if copy_ends == NULL:
674
copy_count = eq.get_raw_matches(line, ©_ends)
676
# No matches for this line
679
# point at the next location, for the next round trip
681
for i from 0 <= i < copy_count:
682
copy_ends[i] = copy_ends[i] + 1
684
# We are in the middle of a match
685
intersect_count = eq.get_next_matches(line, copy_ends, copy_count,
687
if intersect_count == 0:
690
# because the next line is not interesting
382
# cmd == 0 is reserved for future encoding
383
# extensions. In the mean time we must fail when
384
# encountering them (might be data corruption).
390
memcpy(out, data, cmd)
396
raise ValueError('Something wrong with:'
397
' cp_off = %s, cp_size = %s'
398
' source_size = %s, size = %s'
399
% (cp_off, cp_size, source_size, size))
401
raise ValueError('Got delta opcode: 0, not supported')
403
raise ValueError('Insert instruction longer than remaining'
404
' bytes: %d > %d' % (cmd, size))
407
if (data != top or size != 0):
408
raise RuntimeError('Did not extract the number of bytes we expected'
409
' we were left with %d bytes in "size", and top - data = %d'
410
% (size, <int>(top - data)))
413
# *dst_size = out - dst_buf;
414
if (out - dst_buf) != PyString_GET_SIZE(result):
415
raise RuntimeError('Number of bytes extracted did not match the'
416
' size encoded in the delta header.')
420
def apply_delta_to_source(source, delta_start, delta_end):
421
"""Extract a delta from source bytes, and apply it."""
423
cdef Py_ssize_t c_source_size
425
cdef Py_ssize_t c_delta_size
426
cdef Py_ssize_t c_delta_start, c_delta_end
428
if not PyString_CheckExact(source):
429
raise TypeError('source is not a str')
430
c_source_size = PyString_GET_SIZE(source)
431
c_delta_start = delta_start
432
c_delta_end = delta_end
433
if c_delta_start >= c_source_size:
434
raise ValueError('delta starts after source')
435
if c_delta_end > c_source_size:
436
raise ValueError('delta ends after source')
437
if c_delta_start >= c_delta_end:
438
raise ValueError('delta starts after it ends')
440
c_delta_size = c_delta_end - c_delta_start
441
c_source = PyString_AS_STRING(source)
442
c_delta = c_source + c_delta_start
443
# We don't use source_size, because we know the delta should not refer to
444
# any bytes after it starts
445
return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
448
def encode_base128_int(val):
449
"""Convert an integer into a 7-bit lsb encoding."""
450
cdef unsigned int c_val
451
cdef Py_ssize_t count
452
cdef unsigned int num_bytes
453
cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes
457
while c_val >= 0x80 and count < 8:
458
c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
461
if count >= 8 or c_val >= 0x80:
462
raise ValueError('encode_base128_int overflowed the buffer')
463
c_bytes[count] = <unsigned char>(c_val & 0xFF)
465
return PyString_FromStringAndSize(<char *>c_bytes, count)
468
def decode_base128_int(bytes):
469
"""Decode an integer from a 7-bit lsb encoding."""
472
cdef unsigned int uval
474
cdef Py_ssize_t num_low_bytes
475
cdef unsigned char *c_bytes
480
if not PyString_CheckExact(bytes):
481
raise TypeError('bytes is not a string')
482
c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
483
# We take off 1, because we have to be able to decode the non-expanded byte
484
num_low_bytes = PyString_GET_SIZE(bytes) - 1
485
while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
486
val = val | ((c_bytes[offset] & 0x7F) << shift)
489
if c_bytes[offset] & 0x80:
490
raise ValueError('Data not properly formatted, we ran out of'
491
' bytes before 0x80 stopped being set.')
492
val = val | (c_bytes[offset] << shift)
495
uval = <unsigned int> val
694
copy_count = intersect_count
695
range_len = range_len + 1
697
if copy_ends == NULL or copy_count == 0:
698
# We never had a match
699
return None, cpos, None
700
# Because copy_ends is always sorted, we know the 'min' is just the first
702
in_start = copy_ends[0]
703
safe_free(<void**>©_ends)
704
return ((in_start - range_len), range_start, range_len), cpos, None