1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
|
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
"""Compiled extensions for doing compression."""
# Pull in python-compat.h, which provides python2.4 support
cdef extern from "python-compat.h":
pass
cdef extern from "Python.h":
ctypedef struct PyObject:
pass
ctypedef int Py_ssize_t # Required for older pyrex versions
int PyString_CheckExact(object)
char * PyString_AS_STRING(object)
Py_ssize_t PyString_GET_SIZE(object)
object PyString_FromStringAndSize(char *, Py_ssize_t)
cdef extern from *:
ctypedef unsigned long size_t
void * malloc(size_t) nogil
void * realloc(void *, size_t) nogil
void free(void *) nogil
void memcpy(void *, void *, size_t) nogil
cdef extern from "delta.h":
struct source_info:
void *buf
unsigned long size
unsigned long agg_offset
struct delta_index:
pass
ctypedef enum delta_result:
DELTA_OK
DELTA_OUT_OF_MEMORY
DELTA_INDEX_NEEDED
DELTA_SOURCE_EMPTY
DELTA_SOURCE_BAD
DELTA_BUFFER_EMPTY
DELTA_SIZE_TOO_BIG
delta_result create_delta_index(source_info *src,
delta_index *old,
delta_index **fresh,
int max_entries) nogil
delta_result create_delta_index_from_delta(source_info *delta,
delta_index *old,
delta_index **fresh) nogil
void free_delta_index(delta_index *index) nogil
delta_result create_delta(delta_index *indexes,
void *buf, unsigned long bufsize,
unsigned long *delta_size,
unsigned long max_delta_size,
void **delta_data) nogil
unsigned long get_delta_hdr_size(unsigned char **datap,
unsigned char *top) nogil
unsigned long sizeof_delta_index(delta_index *index)
Py_ssize_t DELTA_SIZE_MIN
int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset)
int get_entry_summary(delta_index *index, int pos,
unsigned int *global_offset, unsigned int *hash_val)
unsigned int rabin_hash (unsigned char *data)
cdef void *safe_malloc(size_t count) except NULL:
    """Allocate count bytes with malloc().

    :return: The pointer handed back by malloc().
    :raises MemoryError: If malloc() returned NULL.
    """
    cdef void *buf

    buf = malloc(count)
    if buf != NULL:
        return buf
    raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
cdef void *safe_realloc(void * old, size_t count) except NULL:
    """Resize the allocation at old to count bytes with realloc().

    :return: The pointer handed back by realloc().
    :raises MemoryError: If realloc() returned NULL.
    """
    cdef void *resized

    resized = realloc(old, count)
    if resized != NULL:
        return resized
    raise MemoryError('Failed to reallocate to %d bytes of memory'
                      % (count,))
cdef int safe_free(void **val) except -1:
    """Free the allocation that val points at and reset it to NULL.

    Nothing is freed when val[0] is already NULL, so calling this twice on
    the same slot is harmless.
    """
    assert val != NULL
    if val[0] == NULL:
        return 0
    free(val[0])
    val[0] = NULL
    return 0
def make_delta_index(source):
    """Return a new DeltaIndex whose first source text is source."""
    delta_index = DeltaIndex(source)
    return delta_index
cdef object _translate_delta_failure(delta_result result):
    """Map a failing delta_result code to an exception instance.

    The exception is returned, not raised, so callers write
    ``raise _translate_delta_failure(res)``.

    :param result: The code returned by one of the delta.h functions.
    :return: A MemoryError, ValueError or RuntimeError for the known failure
        codes, and an AssertionError for any code not listed here.
    """
    if result == DELTA_OUT_OF_MEMORY:
        return MemoryError("Delta function failed to allocate memory")
    if result == DELTA_INDEX_NEEDED:
        return ValueError("Delta function requires delta_index param")
    if result == DELTA_SOURCE_EMPTY:
        return ValueError("Delta function given empty source_info param")
    if result == DELTA_SOURCE_BAD:
        return RuntimeError("Delta function given invalid source_info param")
    if result == DELTA_BUFFER_EMPTY:
        return ValueError("Delta function given empty buffer params")
    # Anything else is a code this function was never taught about
    message = "Unrecognised delta result code: %d" % result
    return AssertionError(message)
def _rabin_hash(content):
    """Return rabin_hash() of content as a Python integer.

    :param content: An exact str holding at least 16 bytes.
    :raises ValueError: If content is not an exact str, or is shorter than
        16 bytes.
    """
    cdef unsigned char *c_content
    cdef unsigned int hash_val

    if not PyString_CheckExact(content):
        raise ValueError('content must be a string')
    if len(content) < 16:
        raise ValueError('content must be at least 16 bytes long')
    c_content = <unsigned char*>(PyString_AS_STRING(content))
    hash_val = rabin_hash(c_content)
    # Try to cast it to an int, if it can fit
    return int(hash_val)
cdef class DeltaIndex:
    """An index over a growing list of source texts, used to build deltas.

    Texts are registered with add_source() and add_delta_source();
    make_delta() then encodes a target against everything registered so far.
    """

    # We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
    # isn't performance critical
    # cdef readonly list _sources
    # The Python strings added so far. Keeping them here keeps alive the
    # buffers that the _source_infos entries point into.
    cdef readonly object _sources
    # C array with room for _max_num_sources entries, one per added source
    cdef source_info *_source_infos
    # The index built by the delta.h functions, NULL until one exists
    cdef delta_index *_index
    # Aggregate offset just past the end of the most recently added source
    cdef public unsigned long _source_offset
    cdef readonly unsigned int _max_num_sources
    # Handed to create_delta_index() as its max_entries argument
    cdef public int _max_bytes_to_index

    def __init__(self, source=None, max_bytes_to_index=None):
        """Create the index, optionally seeding it with a first source.

        :param source: If not None, a byte string passed to add_source().
        :param max_bytes_to_index: If not None, stored in
            _max_bytes_to_index; otherwise that attribute is 0.
        :raises MemoryError: If the source_info array cannot be allocated.
        """
        self._sources = []
        self._index = NULL
        self._max_num_sources = 65000
        self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
                                                        * self._max_num_sources)
        self._source_offset = 0
        self._max_bytes_to_index = 0
        if max_bytes_to_index is not None:
            self._max_bytes_to_index = max_bytes_to_index

        if source is not None:
            self.add_source(source, 0)

    def __sizeof__(self):
        """Return an estimate of the memory used by this object, in bytes."""
        # We want to track the _source_infos allocations, but the referenced
        # void* are actually tracked in _sources itself.
        # XXX: Cython is capable of doing sizeof(class) and returning the size
        #      of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
        #      to do it manually. *sigh* Note that we might get it wrong
        #      because of alignment issues.
        cdef Py_ssize_t size
        # PyObject start, vtable *, 3 pointers, an unsigned long and an
        # unsigned int, then the source_info array and the index itself
        size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
                 + sizeof(unsigned long)
                 + sizeof(unsigned int))
                + (sizeof(source_info) * self._max_num_sources)
                + sizeof_delta_index(self._index))
        return size

    def __repr__(self):
        """Return 'ClassName(number_of_sources, source_offset)'."""
        return '%s(%d, %d)' % (self.__class__.__name__,
            len(self._sources), self._source_offset)

    def __dealloc__(self):
        """Release the C index and the source_info array."""
        if self._index != NULL:
            free_delta_index(self._index)
            self._index = NULL
        safe_free(<void **>&self._source_infos)

    def _has_index(self):
        """Return True if the C index has been created."""
        return (self._index != NULL)

    def _dump_index(self):
        """Dump the pointers in the index.

        This is an arbitrary layout, used for testing. It is not meant to be
        used in production code.

        :return: (hash_list, entry_list), or None if there is no index yet.
            hash_list   A list of offsets, so hash[i] points to the 'hash
                        bucket' starting at the given offset and going until
                        hash[i+1]
            entry_list  A list of (text_offset, hash_val). text_offset is the
                        offset in the "source" texts, and hash_val is the RABIN
                        hash for that offset.
                        Note that the entry should be in the hash bucket
                        defined by
                        hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
        """
        cdef int pos
        cdef unsigned int text_offset
        cdef unsigned int hash_val
        cdef unsigned int hash_offset

        if self._index == NULL:
            return None
        hash_list = []
        pos = 0
        while get_hash_offset(self._index, pos, &hash_offset):
            hash_list.append(int(hash_offset))
            pos += 1
        entry_list = []
        pos = 0
        while get_entry_summary(self._index, pos, &text_offset, &hash_val):
            # Map back using 'int' so that we don't get Long everywhere, when
            # almost everything is <2**31.
            val = tuple(map(int, [text_offset, hash_val]))
            entry_list.append(val)
            pos += 1
        return hash_list, entry_list

    def add_delta_source(self, delta, unadded_bytes):
        """Add a new delta to the source texts.

        :param delta: The text of the delta, this must be a byte string.
        :param unadded_bytes: Number of bytes that were added to the source
            that were not indexed.
        :raises TypeError: If delta is not an exact str.
        :raises RuntimeError: If the source_info array is already full.
        """
        cdef char *c_delta
        cdef Py_ssize_t c_delta_size
        cdef delta_index *index
        cdef delta_result res
        cdef unsigned int source_location
        cdef source_info *src

        if not PyString_CheckExact(delta):
            raise TypeError('delta is not a str')

        source_location = len(self._sources)
        if source_location >= self._max_num_sources:
            self._expand_sources()
        self._sources.append(delta)
        c_delta = PyString_AS_STRING(delta)
        c_delta_size = PyString_GET_SIZE(delta)
        src = self._source_infos + source_location
        src.buf = c_delta
        src.size = c_delta_size
        src.agg_offset = self._source_offset + unadded_bytes
        with nogil:
            res = create_delta_index_from_delta(src, self._index, &index)
        if res != DELTA_OK:
            raise _translate_delta_failure(res)
        self._source_offset = src.agg_offset + src.size
        # The C call may hand back the index it was given; only swap (and
        # free the old one) when a different index came back.
        if index != self._index:
            free_delta_index(self._index)
            self._index = index

    def add_source(self, source, unadded_bytes):
        """Add a new bit of source text to the delta indexes.

        The cap on index entries is not a parameter of this method: the
        value of self._max_bytes_to_index is passed to create_delta_index()
        as its max_entries argument.

        :param source: The text in question, this must be a byte string
        :param unadded_bytes: Assume there are this many bytes that didn't get
            added between this source and the end of the previous source.
        :raises TypeError: If source is not an exact str.
        :raises RuntimeError: If the source_info array is already full.
        """
        cdef char *c_source
        cdef Py_ssize_t c_source_size
        cdef delta_index *index
        cdef delta_result res
        cdef unsigned int source_location
        cdef source_info *src

        if not PyString_CheckExact(source):
            raise TypeError('source is not a str')

        source_location = len(self._sources)
        if source_location >= self._max_num_sources:
            self._expand_sources()
        if source_location != 0 and self._index == NULL:
            # We were lazy about populating the index, create it now
            self._populate_first_index()
        self._sources.append(source)
        c_source = PyString_AS_STRING(source)
        c_source_size = PyString_GET_SIZE(source)
        src = self._source_infos + source_location
        src.buf = c_source
        src.size = c_source_size

        src.agg_offset = self._source_offset + unadded_bytes
        self._source_offset = src.agg_offset + src.size
        # We delay creating the index on the first insert
        if source_location != 0:
            with nogil:
                res = create_delta_index(src, self._index, &index,
                                         self._max_bytes_to_index)
            if res != DELTA_OK:
                raise _translate_delta_failure(res)
            if index != self._index:
                free_delta_index(self._index)
                self._index = index

    cdef _populate_first_index(self):
        """Build the index for the first source, which add_source() deferred.

        :raises AssertionError: If there is not exactly one source, or an
            index already exists.
        """
        cdef delta_index *index
        cdef delta_result res

        if len(self._sources) != 1 or self._index != NULL:
            raise AssertionError('_populate_first_index should only be'
                ' called when we have a single source and no index yet')

        # We know that self._index is already NULL, so create_delta_index
        # will always create a new index unless there's a malloc failure
        with nogil:
            res = create_delta_index(&self._source_infos[0], NULL, &index,
                                     self._max_bytes_to_index)
        if res != DELTA_OK:
            raise _translate_delta_failure(res)
        self._index = index

    cdef _expand_sources(self):
        """Refuse to grow the source_info array; always raises RuntimeError."""
        # Growing would mean doubling _max_num_sources and safe_realloc()ing
        # _source_infos, but realloc may move the array and nothing here
        # rewrites the pointers mentioned in the message below.
        raise RuntimeError('if we move self._source_infos, then we need to'
                           ' change all of the index pointers as well.')

    def make_delta(self, target_bytes, max_delta_size=0):
        """Create a delta from the current source to the target bytes.

        :param target_bytes: The text to encode, this must be a byte string.
        :param max_delta_size: Passed through to create_delta() as its
            max_delta_size argument.
        :return: The delta as a byte string. None if no source has been
            added, or if create_delta() reported DELTA_SIZE_TOO_BIG.
        :raises TypeError: If target_bytes is not an exact str.
        """
        cdef char *target
        cdef Py_ssize_t target_size
        cdef void * delta
        cdef unsigned long delta_size
        cdef unsigned long c_max_delta_size
        cdef delta_result res

        if self._index == NULL:
            if len(self._sources) == 0:
                return None
            # We were just lazy about generating the index
            self._populate_first_index()

        if not PyString_CheckExact(target_bytes):
            raise TypeError('target is not a str')

        target = PyString_AS_STRING(target_bytes)
        target_size = PyString_GET_SIZE(target_bytes)

        # TODO: inline some of create_delta so we at least don't have to double
        #       malloc, and can instead use PyString_FromStringAndSize, to
        #       allocate the bytes into the final string
        c_max_delta_size = max_delta_size
        with nogil:
            res = create_delta(self._index, target, target_size,
                               &delta_size, c_max_delta_size, &delta)
        result = None
        if res == DELTA_OK:
            try:
                result = PyString_FromStringAndSize(<char *>delta, delta_size)
            finally:
                # Release the C buffer even when building the Python string
                # fails, otherwise it would be leaked.
                free(delta)
        elif res != DELTA_SIZE_TOO_BIG:
            raise _translate_delta_failure(res)
        return result
def make_delta(source_bytes, target_bytes):
    """Return the delta that turns source_bytes into target_bytes.

    Convenience wrapper: builds a one-off DeltaIndex over source_bytes and
    returns whatever its make_delta() returns for target_bytes.
    """
    return DeltaIndex(source_bytes).make_delta(target_bytes)
def apply_delta(source_bytes, delta_bytes):
    """Apply a delta generated by make_delta to source_bytes.

    :param source_bytes: The text the delta was made against (exact str).
    :param delta_bytes: The delta itself (exact str).
    :return: The reconstructed target text.
    :raises TypeError: If either argument is not an exact str.
    :raises RuntimeError: If the delta is shorter than DELTA_SIZE_MIN.
    """
    cdef char *c_source
    cdef Py_ssize_t c_source_len
    cdef char *c_delta
    cdef Py_ssize_t c_delta_len

    if not PyString_CheckExact(source_bytes):
        raise TypeError('source is not a str')
    if not PyString_CheckExact(delta_bytes):
        raise TypeError('delta is not a str')

    c_delta = PyString_AS_STRING(delta_bytes)
    c_delta_len = PyString_GET_SIZE(delta_bytes)
    # Code taken from patch-delta.c, only brought here to give better error
    # handling, and to avoid double allocating memory
    if c_delta_len < DELTA_SIZE_MIN:
        # XXX: Invalid delta block
        raise RuntimeError('delta_size %d smaller than min delta size %d'
                           % (c_delta_len, DELTA_SIZE_MIN))

    c_source = PyString_AS_STRING(source_bytes)
    c_source_len = PyString_GET_SIZE(source_bytes)
    return _apply_delta(c_source, c_source_len, c_delta, c_delta_len)
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
    unsigned char cmd, unsigned int *offset,
    unsigned int *length) nogil: # cannot_raise
    """Decode a copy instruction from the next few bytes.

    A copy instruction is a variable number of bytes, so we will parse the
    bytes we care about, and return the new position, as well as the offset and
    length referred to in the bytes.

    Bits 0x01..0x08 of cmd each announce one byte of the offset (least
    significant first) and bits 0x10..0x40 each announce one byte of the
    length. A decoded length of 0 stands for 0x10000.

    The caller must make sure that as many bytes as cmd announces are
    readable at bytes; nothing is checked here.

    :param bytes: Pointer to the start of bytes after cmd
    :param cmd: The command code
    :param offset: Receives the decoded copy offset
    :param length: Receives the decoded copy length
    :return: Pointer to the bytes just after the last decode byte
    """
    cdef unsigned int off, size, count
    off = 0
    size = 0
    count = 0
    # Every operand byte is widened to unsigned int before it is shifted. An
    # unsigned char would otherwise be promoted to a signed int, and shifting
    # a value >= 0x80 left by 24 overflows that, which C leaves undefined.
    if (cmd & 0x01):
        off = <unsigned int>bytes[count]
        count = count + 1
    if (cmd & 0x02):
        off = off | ((<unsigned int>bytes[count]) << 8)
        count = count + 1
    if (cmd & 0x04):
        off = off | ((<unsigned int>bytes[count]) << 16)
        count = count + 1
    if (cmd & 0x08):
        off = off | ((<unsigned int>bytes[count]) << 24)
        count = count + 1
    if (cmd & 0x10):
        size = <unsigned int>bytes[count]
        count = count + 1
    if (cmd & 0x20):
        size = size | ((<unsigned int>bytes[count]) << 8)
        count = count + 1
    if (cmd & 0x40):
        size = size | ((<unsigned int>bytes[count]) << 16)
        count = count + 1
    if (size == 0):
        size = 0x10000
    offset[0] = off
    length[0] = size
    return bytes + count
cdef object _apply_delta(char *source, Py_ssize_t source_size,
                         char *delta, Py_ssize_t delta_size):
    """common functionality between apply_delta and apply_delta_to_source.

    Reads the target size from the delta header, allocates the result string
    and then replays the copy and insert instructions into it.

    :param source: Start of the bytes that copy instructions refer to.
    :param source_size: Number of bytes readable at source.
    :param delta: Start of the delta (header followed by instructions).
    :param delta_size: Number of bytes readable at delta.
    :return: The reconstructed text as a str.
    :raises ValueError: For a copy outside of source or larger than the
        remaining output, for the reserved opcode 0, and for an insert that
        is longer than the remaining output.
    :raises RuntimeError: If an instruction runs past the end of the delta,
        or the instructions do not produce exactly the number of bytes
        promised by the header.
    """
    cdef unsigned char *data, *top
    cdef unsigned char *dst_buf, *out, cmd
    cdef Py_ssize_t size
    cdef unsigned int cp_off, cp_size
    cdef unsigned int operand_bits
    cdef Py_ssize_t operand_count
    cdef int failed

    data = <unsigned char *>delta
    top = data + delta_size

    # now the result size
    size = get_delta_hdr_size(&data, top)
    result = PyString_FromStringAndSize(NULL, size)
    dst_buf = <unsigned char*>PyString_AS_STRING(result)

    failed = 0
    with nogil:
        out = dst_buf
        while (data < top):
            cmd = data[0]
            data = data + 1
            if (cmd & 0x80):
                # Copy instruction
                if (top - data) < 7:
                    # A copy opcode is followed by one operand byte per low
                    # bit that is set, so at most 7. Only this close to the
                    # end can they overrun the delta, so only here do we pay
                    # for counting them.
                    operand_bits = cmd & 0x7F
                    operand_count = 0
                    while operand_bits:
                        operand_count = operand_count + (operand_bits & 1)
                        operand_bits = operand_bits >> 1
                    if operand_count > (top - data):
                        failed = 4
                        break
                data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
                # The first test catches cp_off + cp_size wrapping around
                if (cp_off + cp_size < cp_size or
                    cp_off + cp_size > <unsigned int>source_size or
                    cp_size > <unsigned int>size):
                    failed = 1
                    break
                memcpy(out, source + cp_off, cp_size)
                out = out + cp_size
                size = size - cp_size
            else:
                # Insert instruction
                if cmd == 0:
                    # cmd == 0 is reserved for future encoding
                    # extensions. In the mean time we must fail when
                    # encountering them (might be data corruption).
                    failed = 2
                    break
                if cmd > size:
                    failed = 3
                    break
                if cmd > (top - data):
                    # The literal bytes are not all inside the delta; do not
                    # let memcpy read beyond the end of the buffer.
                    failed = 4
                    break
                memcpy(out, data, cmd)
                out = out + cmd
                data = data + cmd
                size = size - cmd
    if failed:
        if failed == 1:
            raise ValueError('Something wrong with:'
                ' cp_off = %s, cp_size = %s'
                ' source_size = %s, size = %s'
                % (cp_off, cp_size, source_size, size))
        elif failed == 2:
            raise ValueError('Got delta opcode: 0, not supported')
        elif failed == 3:
            raise ValueError('Insert instruction longer than remaining'
                ' bytes: %d > %d' % (cmd, size))
        elif failed == 4:
            raise RuntimeError('Delta instruction extends past the end of'
                ' the delta')

    # sanity check
    if (data != top or size != 0):
        raise RuntimeError('Did not extract the number of bytes we expected'
            ' we were left with %d bytes in "size", and top - data = %d'
            % (size, <int>(top - data)))

    # *dst_size = out - dst_buf;
    if (out - dst_buf) != PyString_GET_SIZE(result):
        raise RuntimeError('Number of bytes extracted did not match the'
            ' size encoded in the delta header.')
    return result
def apply_delta_to_source(source, delta_start, delta_end):
    """Extract a delta from source bytes, and apply it.

    The delta is the slice source[delta_start:delta_end]; it is applied
    against the bytes of source that come before delta_start.

    :param source: A byte string holding both the source text and the delta.
    :param delta_start: Offset in source of the first byte of the delta.
    :param delta_end: Offset in source just past the last byte of the delta.
    :return: The reconstructed text.
    :raises TypeError: If source is not an exact str.
    :raises ValueError: If the offsets do not describe a non-empty range
        inside source.
    """
    cdef char *c_source
    cdef Py_ssize_t c_source_size
    cdef char *c_delta
    cdef Py_ssize_t c_delta_size
    cdef Py_ssize_t c_delta_start, c_delta_end

    if not PyString_CheckExact(source):
        raise TypeError('source is not a str')
    c_source_size = PyString_GET_SIZE(source)
    c_delta_start = delta_start
    c_delta_end = delta_end
    if c_delta_start < 0:
        # A negative start would put the delta pointer in front of the
        # buffer and hand a negative source size to _apply_delta
        raise ValueError('delta starts before source')
    if c_delta_start >= c_source_size:
        raise ValueError('delta starts after source')
    if c_delta_end > c_source_size:
        raise ValueError('delta ends after source')
    if c_delta_start >= c_delta_end:
        raise ValueError('delta starts after it ends')

    c_delta_size = c_delta_end - c_delta_start
    c_source = PyString_AS_STRING(source)
    c_delta = c_source + c_delta_start
    # We don't use source_size, because we know the delta should not refer to
    # any bytes after it starts
    return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding.

    Each output byte carries 7 bits of the value, least significant group
    first; every byte except the last has its high bit set.

    :param val: The integer to encode; it is converted to a C unsigned int.
    :return: The encoded bytes as a str.
    :raises ValueError: If the encoding would not fit the internal buffer.
    """
    cdef unsigned int remaining
    cdef Py_ssize_t n_written
    cdef unsigned char buf[8] # max size for 32-bit int is 5 bytes

    remaining = val
    n_written = 0
    while remaining >= 0x80 and n_written < 8:
        # Low 7 bits of what is left, flagged as "more bytes follow"
        buf[n_written] = <unsigned char>((remaining | 0x80) & 0xFF)
        remaining = remaining >> 7
        n_written += 1
    if n_written >= 8 or remaining >= 0x80:
        raise ValueError('encode_base128_int overflowed the buffer')
    # The final byte is written without the continuation flag
    buf[n_written] = <unsigned char>(remaining & 0xFF)
    n_written += 1
    return PyString_FromStringAndSize(<char *>buf, n_written)
def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding.

    Bytes with the high bit set contribute their low 7 bits and announce
    that another byte follows; the first byte without the high bit ends the
    number. Bytes after that one are ignored.

    :param bytes: An exact str starting with the encoded integer.
    :return: (value, offset) where offset is the number of bytes consumed.
    :raises TypeError: If bytes is not an exact str.
    :raises ValueError: If bytes is empty, if it ends before the number
        does, or if the number has too many bytes for a 32-bit integer.
    """
    cdef int offset
    cdef int val
    cdef unsigned int uval
    cdef int shift
    cdef Py_ssize_t num_low_bytes
    cdef unsigned char *c_bytes

    offset = 0
    uval = 0
    shift = 0
    if not PyString_CheckExact(bytes):
        raise TypeError('bytes is not a string')
    c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
    # We take off 1, because we have to be able to decode the non-expanded byte
    num_low_bytes = PyString_GET_SIZE(bytes) - 1
    if num_low_bytes < 0:
        # Nothing to decode; without this check we would report having
        # consumed a byte that is not part of the input.
        raise ValueError('Data not properly formatted, there are no bytes'
                         ' to decode.')
    while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
        if shift >= 28:
            # Another byte after this one would be shifted by 35 bits or
            # more, which a 32-bit integer cannot hold (and C leaves the
            # shift undefined).
            raise ValueError('Data not properly formatted, too many bytes'
                             ' for a 32-bit integer.')
        # Accumulate unsigned so that reaching bit 31 is well defined
        uval = uval | ((<unsigned int>(c_bytes[offset] & 0x7F)) << shift)
        shift = shift + 7
        offset = offset + 1
    if c_bytes[offset] & 0x80:
        raise ValueError('Data not properly formatted, we ran out of'
                         ' bytes before 0x80 stopped being set.')
    uval = uval | ((<unsigned int>c_bytes[offset]) << shift)
    offset = offset + 1
    if uval > 0x7FFFFFFF:
        # Does not fit a signed int, so hand back the unsigned value
        return uval, offset
    # Values that fit are returned through a signed int, as before
    val = <int>uval
    return val, offset
|