1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Compiled extensions for doing compression."""
20
cdef extern from "python-compat.h":
24
cdef extern from "Python.h":
25
ctypedef struct PyObject:
27
ctypedef int Py_ssize_t # Required for older pyrex versions
28
int PyString_CheckExact(object)
29
char * PyString_AS_STRING(object)
30
Py_ssize_t PyString_GET_SIZE(object)
31
object PyString_FromStringAndSize(char *, Py_ssize_t)
35
ctypedef unsigned long size_t
36
void * malloc(size_t) nogil
37
void * realloc(void *, size_t) nogil
38
void free(void *) nogil
39
void memcpy(void *, void *, size_t) nogil
42
cdef extern from "delta.h":
46
unsigned long agg_offset
49
ctypedef enum delta_result:
57
delta_result create_delta_index(source_info *src,
60
int max_entries) nogil
61
delta_result create_delta_index_from_delta(source_info *delta,
63
delta_index **fresh) nogil
64
void free_delta_index(delta_index *index) nogil
65
delta_result create_delta(delta_index *indexes,
66
void *buf, unsigned long bufsize,
67
unsigned long *delta_size,
68
unsigned long max_delta_size,
69
void **delta_data) nogil
70
unsigned long get_delta_hdr_size(unsigned char **datap,
71
unsigned char *top) nogil
72
unsigned long sizeof_delta_index(delta_index *index)
73
Py_ssize_t DELTA_SIZE_MIN
74
int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset)
75
int get_entry_summary(delta_index *index, int pos,
76
unsigned int *global_offset, unsigned int *hash_val)
77
unsigned int rabin_hash (unsigned char *data)
80
cdef void *safe_malloc(size_t count) except NULL:
82
result = malloc(count)
84
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
88
cdef void *safe_realloc(void * old, size_t count) except NULL:
90
result = realloc(old, count)
92
raise MemoryError('Failed to reallocate to %d bytes of memory'
97
cdef int safe_free(void **val) except -1:
103
def make_delta_index(source):
104
return DeltaIndex(source)
107
cdef object _translate_delta_failure(delta_result result):
108
if result == DELTA_OUT_OF_MEMORY:
109
return MemoryError("Delta function failed to allocate memory")
110
elif result == DELTA_INDEX_NEEDED:
111
return ValueError("Delta function requires delta_index param")
112
elif result == DELTA_SOURCE_EMPTY:
113
return ValueError("Delta function given empty source_info param")
114
elif result == DELTA_SOURCE_BAD:
115
return RuntimeError("Delta function given invalid source_info param")
116
elif result == DELTA_BUFFER_EMPTY:
117
return ValueError("Delta function given empty buffer params")
118
return AssertionError("Unrecognised delta result code: %d" % result)
121
def _rabin_hash(content):
122
if not PyString_CheckExact(content):
123
raise ValueError('content must be a string')
124
if len(content) < 16:
125
raise ValueError('content must be at least 16 bytes long')
126
# Try to cast it to an int, if it can fit
127
return int(rabin_hash(<unsigned char*>(PyString_AS_STRING(content))))
130
cdef class DeltaIndex:
132
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
133
# isn't performance critical
134
# cdef readonly list _sources
135
cdef readonly object _sources
136
cdef source_info *_source_infos
137
cdef delta_index *_index
138
cdef public unsigned long _source_offset
139
cdef readonly unsigned int _max_num_sources
140
cdef public int _max_bytes_to_index
142
def __init__(self, source=None, max_bytes_to_index=None):
145
self._max_num_sources = 65000
146
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
147
* self._max_num_sources)
148
self._source_offset = 0
149
self._max_bytes_to_index = 0
150
if max_bytes_to_index is not None:
151
self._max_bytes_to_index = max_bytes_to_index
153
if source is not None:
154
self.add_source(source, 0)
156
def __sizeof__(self):
157
# We want to track the _source_infos allocations, but the referenced
158
# void* are actually tracked in _sources itself.
159
# XXX: Cython is capable of doing sizeof(class) and returning the size
160
# of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
161
# to do it manually. *sigh* Note that we might get it wrong
162
# because of alignment issues.
164
# PyObject start, vtable *, 3 object pointers, 2 C ints
165
size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
166
+ sizeof(unsigned long)
167
+ sizeof(unsigned int))
168
+ (sizeof(source_info) * self._max_num_sources)
169
+ sizeof_delta_index(self._index))
173
return '%s(%d, %d)' % (self.__class__.__name__,
174
len(self._sources), self._source_offset)
176
def __dealloc__(self):
177
if self._index != NULL:
178
free_delta_index(self._index)
180
safe_free(<void **>&self._source_infos)
182
def _has_index(self):
183
return (self._index != NULL)
185
def _dump_index(self):
186
"""Dump the pointers in the index.
188
This is an arbitrary layout, used for testing. It is not meant to be
189
used in production code.
191
:return: (hash_list, entry_list)
192
hash_list A list of offsets, so hash[i] points to the 'hash
193
bucket' starting at the given offset and going until
195
entry_list A list of (text_offset, hash_val). text_offset is the
196
offset in the "source" texts, and hash_val is the RABIN
197
hash for that offset.
198
Note that the entry should be in the hash bucket
200
hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
203
cdef unsigned int text_offset
204
cdef unsigned int hash_val
205
cdef unsigned int hash_offset
206
if self._index == NULL:
210
while get_hash_offset(self._index, pos, &hash_offset):
211
hash_list.append(int(hash_offset))
215
while get_entry_summary(self._index, pos, &text_offset, &hash_val):
216
# Map back using 'int' so that we don't get Long everywhere, when
217
# almost everything is <2**31.
218
val = tuple(map(int, [text_offset, hash_val]))
219
entry_list.append(val)
221
return hash_list, entry_list
223
def add_delta_source(self, delta, unadded_bytes):
224
"""Add a new delta to the source texts.
226
:param delta: The text of the delta, this must be a byte string.
227
:param unadded_bytes: Number of bytes that were added to the source
228
that were not indexed.
231
cdef Py_ssize_t c_delta_size
232
cdef delta_index *index
233
cdef delta_result res
234
cdef unsigned int source_location
235
cdef source_info *src
236
cdef unsigned int num_indexes
238
if not PyString_CheckExact(delta):
239
raise TypeError('delta is not a str')
241
source_location = len(self._sources)
242
if source_location >= self._max_num_sources:
243
self._expand_sources()
244
self._sources.append(delta)
245
c_delta = PyString_AS_STRING(delta)
246
c_delta_size = PyString_GET_SIZE(delta)
247
src = self._source_infos + source_location
249
src.size = c_delta_size
250
src.agg_offset = self._source_offset + unadded_bytes
252
res = create_delta_index_from_delta(src, self._index, &index)
254
raise _translate_delta_failure(res)
255
self._source_offset = src.agg_offset + src.size
256
if index != self._index:
257
free_delta_index(self._index)
260
def add_source(self, source, unadded_bytes):
261
"""Add a new bit of source text to the delta indexes.
263
:param source: The text in question, this must be a byte string
264
:param unadded_bytes: Assume there are this many bytes that didn't get
265
added between this source and the end of the previous source.
266
:param max_pointers: Add no more than this many entries to the index.
267
By default, we sample every 16 bytes, if that would require more
268
than max_entries, we will reduce the sampling rate.
269
A value of 0 means unlimited, None means use the default limit.
272
cdef Py_ssize_t c_source_size
273
cdef delta_index *index
274
cdef delta_result res
275
cdef unsigned int source_location
276
cdef source_info *src
277
cdef unsigned int num_indexes
278
cdef int max_num_entries
280
if not PyString_CheckExact(source):
281
raise TypeError('source is not a str')
283
source_location = len(self._sources)
284
if source_location >= self._max_num_sources:
285
self._expand_sources()
286
if source_location != 0 and self._index == NULL:
287
# We were lazy about populating the index, create it now
288
self._populate_first_index()
289
self._sources.append(source)
290
c_source = PyString_AS_STRING(source)
291
c_source_size = PyString_GET_SIZE(source)
292
src = self._source_infos + source_location
294
src.size = c_source_size
296
src.agg_offset = self._source_offset + unadded_bytes
297
self._source_offset = src.agg_offset + src.size
298
# We delay creating the index on the first insert
299
if source_location != 0:
301
res = create_delta_index(src, self._index, &index,
302
self._max_bytes_to_index)
304
raise _translate_delta_failure(res)
305
if index != self._index:
306
free_delta_index(self._index)
309
cdef _populate_first_index(self):
310
cdef delta_index *index
311
cdef delta_result res
312
if len(self._sources) != 1 or self._index != NULL:
313
raise AssertionError('_populate_first_index should only be'
314
' called when we have a single source and no index yet')
316
# We know that self._index is already NULL, so create_delta_index
317
# will always create a new index unless there's a malloc failure
319
res = create_delta_index(&self._source_infos[0], NULL, &index,
320
self._max_bytes_to_index)
322
raise _translate_delta_failure(res)
325
cdef _expand_sources(self):
326
raise RuntimeError('if we move self._source_infos, then we need to'
327
' change all of the index pointers as well.')
328
self._max_num_sources = self._max_num_sources * 2
329
self._source_infos = <source_info *>safe_realloc(self._source_infos,
331
* self._max_num_sources)
333
def make_delta(self, target_bytes, max_delta_size=0):
334
"""Create a delta from the current source to the target bytes."""
336
cdef Py_ssize_t target_size
338
cdef unsigned long delta_size
339
cdef unsigned long c_max_delta_size
340
cdef delta_result res
342
if self._index == NULL:
343
if len(self._sources) == 0:
345
# We were just lazy about generating the index
346
self._populate_first_index()
348
if not PyString_CheckExact(target_bytes):
349
raise TypeError('target is not a str')
351
target = PyString_AS_STRING(target_bytes)
352
target_size = PyString_GET_SIZE(target_bytes)
354
# TODO: inline some of create_delta so we at least don't have to double
355
# malloc, and can instead use PyString_FromStringAndSize, to
356
# allocate the bytes into the final string
357
c_max_delta_size = max_delta_size
359
res = create_delta(self._index, target, target_size,
360
&delta_size, c_max_delta_size, &delta)
363
result = PyString_FromStringAndSize(<char *>delta, delta_size)
365
elif res != DELTA_SIZE_TOO_BIG:
366
raise _translate_delta_failure(res)
370
def make_delta(source_bytes, target_bytes):
371
"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""
372
di = DeltaIndex(source_bytes)
373
return di.make_delta(target_bytes)
376
def apply_delta(source_bytes, delta_bytes):
377
"""Apply a delta generated by make_delta to source_bytes."""
379
cdef Py_ssize_t source_size
381
cdef Py_ssize_t delta_size
383
if not PyString_CheckExact(source_bytes):
384
raise TypeError('source is not a str')
385
if not PyString_CheckExact(delta_bytes):
386
raise TypeError('delta is not a str')
387
source = PyString_AS_STRING(source_bytes)
388
source_size = PyString_GET_SIZE(source_bytes)
389
delta = PyString_AS_STRING(delta_bytes)
390
delta_size = PyString_GET_SIZE(delta_bytes)
391
# Code taken from patch-delta.c, only brought here to give better error
392
# handling, and to avoid double allocating memory
393
if (delta_size < DELTA_SIZE_MIN):
394
# XXX: Invalid delta block
395
raise RuntimeError('delta_size %d smaller than min delta size %d'
396
% (delta_size, DELTA_SIZE_MIN))
398
return _apply_delta(source, source_size, delta, delta_size)
401
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
402
unsigned char cmd, unsigned int *offset,
403
unsigned int *length) nogil: # cannot_raise
404
"""Decode a copy instruction from the next few bytes.
406
A copy instruction is a variable number of bytes, so we will parse the
407
bytes we care about, and return the new position, as well as the offset and
408
length referred to in the bytes.
410
:param bytes: Pointer to the start of bytes after cmd
411
:param cmd: The command code
412
:return: Pointer to the bytes just after the last decode byte
414
cdef unsigned int off, size, count
422
off = off | (bytes[count] << 8)
425
off = off | (bytes[count] << 16)
428
off = off | (bytes[count] << 24)
434
size = size | (bytes[count] << 8)
437
size = size | (bytes[count] << 16)
446
cdef object _apply_delta(char *source, Py_ssize_t source_size,
447
char *delta, Py_ssize_t delta_size):
448
"""common functionality between apply_delta and apply_delta_to_source."""
449
cdef unsigned char *data, *top
450
cdef unsigned char *dst_buf, *out, cmd
452
cdef unsigned int cp_off, cp_size
455
data = <unsigned char *>delta
456
top = data + delta_size
458
# now the result size
459
size = get_delta_hdr_size(&data, top)
460
result = PyString_FromStringAndSize(NULL, size)
461
dst_buf = <unsigned char*>PyString_AS_STRING(result)
471
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
472
if (cp_off + cp_size < cp_size or
473
cp_off + cp_size > <unsigned int>source_size or
474
cp_size > <unsigned int>size):
477
memcpy(out, source + cp_off, cp_size)
479
size = size - cp_size
483
# cmd == 0 is reserved for future encoding
484
# extensions. In the mean time we must fail when
485
# encountering them (might be data corruption).
491
memcpy(out, data, cmd)
497
raise ValueError('Something wrong with:'
498
' cp_off = %s, cp_size = %s'
499
' source_size = %s, size = %s'
500
% (cp_off, cp_size, source_size, size))
502
raise ValueError('Got delta opcode: 0, not supported')
504
raise ValueError('Insert instruction longer than remaining'
505
' bytes: %d > %d' % (cmd, size))
508
if (data != top or size != 0):
509
raise RuntimeError('Did not extract the number of bytes we expected'
510
' we were left with %d bytes in "size", and top - data = %d'
511
% (size, <int>(top - data)))
514
# *dst_size = out - dst_buf;
515
if (out - dst_buf) != PyString_GET_SIZE(result):
516
raise RuntimeError('Number of bytes extracted did not match the'
517
' size encoded in the delta header.')
521
def apply_delta_to_source(source, delta_start, delta_end):
522
"""Extract a delta from source bytes, and apply it."""
524
cdef Py_ssize_t c_source_size
526
cdef Py_ssize_t c_delta_size
527
cdef Py_ssize_t c_delta_start, c_delta_end
529
if not PyString_CheckExact(source):
530
raise TypeError('source is not a str')
531
c_source_size = PyString_GET_SIZE(source)
532
c_delta_start = delta_start
533
c_delta_end = delta_end
534
if c_delta_start >= c_source_size:
535
raise ValueError('delta starts after source')
536
if c_delta_end > c_source_size:
537
raise ValueError('delta ends after source')
538
if c_delta_start >= c_delta_end:
539
raise ValueError('delta starts after it ends')
541
c_delta_size = c_delta_end - c_delta_start
542
c_source = PyString_AS_STRING(source)
543
c_delta = c_source + c_delta_start
544
# We don't use source_size, because we know the delta should not refer to
545
# any bytes after it starts
546
return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
549
def encode_base128_int(val):
550
"""Convert an integer into a 7-bit lsb encoding."""
551
cdef unsigned int c_val
552
cdef Py_ssize_t count
553
cdef unsigned int num_bytes
554
cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes
558
while c_val >= 0x80 and count < 8:
559
c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
562
if count >= 8 or c_val >= 0x80:
563
raise ValueError('encode_base128_int overflowed the buffer')
564
c_bytes[count] = <unsigned char>(c_val & 0xFF)
566
return PyString_FromStringAndSize(<char *>c_bytes, count)
569
def decode_base128_int(bytes):
570
"""Decode an integer from a 7-bit lsb encoding."""
573
cdef unsigned int uval
575
cdef Py_ssize_t num_low_bytes
576
cdef unsigned char *c_bytes
581
if not PyString_CheckExact(bytes):
582
raise TypeError('bytes is not a string')
583
c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
584
# We take off 1, because we have to be able to decode the non-expanded byte
585
num_low_bytes = PyString_GET_SIZE(bytes) - 1
586
while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
587
val = val | ((c_bytes[offset] & 0x7F) << shift)
590
if c_bytes[offset] & 0x80:
591
raise ValueError('Data not properly formatted, we ran out of'
592
' bytes before 0x80 stopped being set.')
593
val = val | (c_bytes[offset] << shift)
596
uval = <unsigned int> val