1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Compiled extensions for doing compression."""
20
cdef extern from "python-compat.h":
24
cdef extern from "Python.h":
25
ctypedef struct PyObject:
27
ctypedef int Py_ssize_t # Required for older pyrex versions
28
int PyString_CheckExact(object)
29
char * PyString_AS_STRING(object)
30
Py_ssize_t PyString_GET_SIZE(object)
31
object PyString_FromStringAndSize(char *, Py_ssize_t)
35
ctypedef unsigned long size_t
36
void * malloc(size_t) nogil
37
void * realloc(void *, size_t) nogil
38
void free(void *) nogil
39
void memcpy(void *, void *, size_t) nogil
42
cdef extern from "delta.h":
46
unsigned long agg_offset
49
ctypedef enum delta_result:
57
delta_result create_delta_index(source_info *src,
59
delta_index **fresh) nogil
60
delta_result create_delta_index_from_delta(source_info *delta,
62
delta_index **fresh) nogil
63
void free_delta_index(delta_index *index) nogil
64
delta_result create_delta(delta_index *indexes,
65
void *buf, unsigned long bufsize,
66
unsigned long *delta_size,
67
unsigned long max_delta_size,
68
void **delta_data) nogil
69
unsigned long get_delta_hdr_size(unsigned char **datap,
70
unsigned char *top) nogil
71
unsigned long sizeof_delta_index(delta_index *index)
72
Py_ssize_t DELTA_SIZE_MIN
75
cdef void *safe_malloc(size_t count) except NULL:
77
result = malloc(count)
79
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
83
cdef void *safe_realloc(void * old, size_t count) except NULL:
85
result = realloc(old, count)
87
raise MemoryError('Failed to reallocate to %d bytes of memory'
92
cdef int safe_free(void **val) except -1:
98
def make_delta_index(source):
99
return DeltaIndex(source)
102
cdef object _translate_delta_failure(delta_result result):
103
if result == DELTA_OUT_OF_MEMORY:
104
return MemoryError("Delta function failed to allocate memory")
105
elif result == DELTA_INDEX_NEEDED:
106
return ValueError("Delta function requires delta_index param")
107
elif result == DELTA_SOURCE_EMPTY:
108
return ValueError("Delta function given empty source_info param")
109
elif result == DELTA_SOURCE_BAD:
110
return RuntimeError("Delta function given invalid source_info param")
111
elif result == DELTA_BUFFER_EMPTY:
112
return ValueError("Delta function given empty buffer params")
113
return AssertionError("Unrecognised delta result code: %d" % result)
116
cdef class DeltaIndex:
118
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
119
# isn't performance critical
120
# cdef readonly list _sources
121
cdef readonly object _sources
122
cdef source_info *_source_infos
123
cdef delta_index *_index
124
cdef public unsigned long _source_offset
125
cdef readonly unsigned int _max_num_sources
127
def __init__(self, source=None):
130
self._max_num_sources = 65000
131
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
132
* self._max_num_sources)
133
self._source_offset = 0
135
if source is not None:
136
self.add_source(source, 0)
138
def __sizeof__(self):
139
# We want to track the _source_infos allocations, but the referenced
140
# void* are actually tracked in _sources itself.
141
# XXX: Cython is capable of doing sizeof(class) and returning the size
142
# of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
143
# to do it manually. *sigh* Note that we might get it wrong
144
# because of alignment issues.
146
# PyObject start, vtable *, 3 object pointers, 2 C ints
147
size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
148
+ sizeof(unsigned long)
149
+ sizeof(unsigned int))
150
+ (sizeof(source_info) * self._max_num_sources)
151
+ sizeof_delta_index(self._index))
155
return '%s(%d, %d)' % (self.__class__.__name__,
156
len(self._sources), self._source_offset)
158
def __dealloc__(self):
159
if self._index != NULL:
160
free_delta_index(self._index)
162
safe_free(<void **>&self._source_infos)
164
def _has_index(self):
165
return (self._index != NULL)
167
def add_delta_source(self, delta, unadded_bytes):
168
"""Add a new delta to the source texts.
170
:param delta: The text of the delta, this must be a byte string.
171
:param unadded_bytes: Number of bytes that were added to the source
172
that were not indexed.
175
cdef Py_ssize_t c_delta_size
176
cdef delta_index *index
177
cdef delta_result res
178
cdef unsigned int source_location
179
cdef source_info *src
180
cdef unsigned int num_indexes
182
if not PyString_CheckExact(delta):
183
raise TypeError('delta is not a str')
185
source_location = len(self._sources)
186
if source_location >= self._max_num_sources:
187
self._expand_sources()
188
self._sources.append(delta)
189
c_delta = PyString_AS_STRING(delta)
190
c_delta_size = PyString_GET_SIZE(delta)
191
src = self._source_infos + source_location
193
src.size = c_delta_size
194
src.agg_offset = self._source_offset + unadded_bytes
196
res = create_delta_index_from_delta(src, self._index, &index)
198
raise _translate_delta_failure(res)
199
self._source_offset = src.agg_offset + src.size
200
if index != self._index:
201
free_delta_index(self._index)
204
def add_source(self, source, unadded_bytes):
205
"""Add a new bit of source text to the delta indexes.
207
:param source: The text in question, this must be a byte string
208
:param unadded_bytes: Assume there are this many bytes that didn't get
209
added between this source and the end of the previous source.
212
cdef Py_ssize_t c_source_size
213
cdef delta_index *index
214
cdef delta_result res
215
cdef unsigned int source_location
216
cdef source_info *src
217
cdef unsigned int num_indexes
219
if not PyString_CheckExact(source):
220
raise TypeError('source is not a str')
222
source_location = len(self._sources)
223
if source_location >= self._max_num_sources:
224
self._expand_sources()
225
if source_location != 0 and self._index == NULL:
226
# We were lazy about populating the index, create it now
227
self._populate_first_index()
228
self._sources.append(source)
229
c_source = PyString_AS_STRING(source)
230
c_source_size = PyString_GET_SIZE(source)
231
src = self._source_infos + source_location
233
src.size = c_source_size
235
src.agg_offset = self._source_offset + unadded_bytes
236
self._source_offset = src.agg_offset + src.size
237
# We delay creating the index on the first insert
238
if source_location != 0:
240
res = create_delta_index(src, self._index, &index)
242
raise _translate_delta_failure(res)
243
if index != self._index:
244
free_delta_index(self._index)
247
cdef _populate_first_index(self):
248
cdef delta_index *index
249
cdef delta_result res
250
if len(self._sources) != 1 or self._index != NULL:
251
raise AssertionError('_populate_first_index should only be'
252
' called when we have a single source and no index yet')
254
# We know that self._index is already NULL, so create_delta_index
255
# will always create a new index unless there's a malloc failure
257
res = create_delta_index(&self._source_infos[0], NULL, &index)
259
raise _translate_delta_failure(res)
262
cdef _expand_sources(self):
263
raise RuntimeError('if we move self._source_infos, then we need to'
264
' change all of the index pointers as well.')
265
self._max_num_sources = self._max_num_sources * 2
266
self._source_infos = <source_info *>safe_realloc(self._source_infos,
268
* self._max_num_sources)
270
def make_delta(self, target_bytes, max_delta_size=0):
271
"""Create a delta from the current source to the target bytes."""
273
cdef Py_ssize_t target_size
275
cdef unsigned long delta_size
276
cdef unsigned long c_max_delta_size
277
cdef delta_result res
279
if self._index == NULL:
280
if len(self._sources) == 0:
282
# We were just lazy about generating the index
283
self._populate_first_index()
285
if not PyString_CheckExact(target_bytes):
286
raise TypeError('target is not a str')
288
target = PyString_AS_STRING(target_bytes)
289
target_size = PyString_GET_SIZE(target_bytes)
291
# TODO: inline some of create_delta so we at least don't have to double
292
# malloc, and can instead use PyString_FromStringAndSize, to
293
# allocate the bytes into the final string
294
c_max_delta_size = max_delta_size
296
res = create_delta(self._index, target, target_size,
297
&delta_size, c_max_delta_size, &delta)
300
result = PyString_FromStringAndSize(<char *>delta, delta_size)
302
elif res != DELTA_SIZE_TOO_BIG:
303
raise _translate_delta_failure(res)
307
def make_delta(source_bytes, target_bytes):
308
"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""
309
di = DeltaIndex(source_bytes)
310
return di.make_delta(target_bytes)
313
def apply_delta(source_bytes, delta_bytes):
314
"""Apply a delta generated by make_delta to source_bytes."""
316
cdef Py_ssize_t source_size
318
cdef Py_ssize_t delta_size
320
if not PyString_CheckExact(source_bytes):
321
raise TypeError('source is not a str')
322
if not PyString_CheckExact(delta_bytes):
323
raise TypeError('delta is not a str')
324
source = PyString_AS_STRING(source_bytes)
325
source_size = PyString_GET_SIZE(source_bytes)
326
delta = PyString_AS_STRING(delta_bytes)
327
delta_size = PyString_GET_SIZE(delta_bytes)
328
# Code taken from patch-delta.c, only brought here to give better error
329
# handling, and to avoid double allocating memory
330
if (delta_size < DELTA_SIZE_MIN):
331
# XXX: Invalid delta block
332
raise RuntimeError('delta_size %d smaller than min delta size %d'
333
% (delta_size, DELTA_SIZE_MIN))
335
return _apply_delta(source, source_size, delta, delta_size)
338
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
339
unsigned char cmd, unsigned int *offset,
340
unsigned int *length) nogil: # cannot_raise
341
"""Decode a copy instruction from the next few bytes.
343
A copy instruction is a variable number of bytes, so we will parse the
344
bytes we care about, and return the new position, as well as the offset and
345
length referred to in the bytes.
347
:param bytes: Pointer to the start of bytes after cmd
348
:param cmd: The command code
349
:return: Pointer to the bytes just after the last decode byte
351
cdef unsigned int off, size, count
359
off = off | (bytes[count] << 8)
362
off = off | (bytes[count] << 16)
365
off = off | (bytes[count] << 24)
371
size = size | (bytes[count] << 8)
374
size = size | (bytes[count] << 16)
383
cdef object _apply_delta(char *source, Py_ssize_t source_size,
384
char *delta, Py_ssize_t delta_size):
385
"""common functionality between apply_delta and apply_delta_to_source."""
386
cdef unsigned char *data, *top
387
cdef unsigned char *dst_buf, *out, cmd
389
cdef unsigned int cp_off, cp_size
392
data = <unsigned char *>delta
393
top = data + delta_size
395
# now the result size
396
size = get_delta_hdr_size(&data, top)
397
result = PyString_FromStringAndSize(NULL, size)
398
dst_buf = <unsigned char*>PyString_AS_STRING(result)
408
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
409
if (cp_off + cp_size < cp_size or
410
cp_off + cp_size > <unsigned int>source_size or
411
cp_size > <unsigned int>size):
414
memcpy(out, source + cp_off, cp_size)
416
size = size - cp_size
420
# cmd == 0 is reserved for future encoding
421
# extensions. In the mean time we must fail when
422
# encountering them (might be data corruption).
428
memcpy(out, data, cmd)
434
raise ValueError('Something wrong with:'
435
' cp_off = %s, cp_size = %s'
436
' source_size = %s, size = %s'
437
% (cp_off, cp_size, source_size, size))
439
raise ValueError('Got delta opcode: 0, not supported')
441
raise ValueError('Insert instruction longer than remaining'
442
' bytes: %d > %d' % (cmd, size))
445
if (data != top or size != 0):
446
raise RuntimeError('Did not extract the number of bytes we expected'
447
' we were left with %d bytes in "size", and top - data = %d'
448
% (size, <int>(top - data)))
451
# *dst_size = out - dst_buf;
452
if (out - dst_buf) != PyString_GET_SIZE(result):
453
raise RuntimeError('Number of bytes extracted did not match the'
454
' size encoded in the delta header.')
458
def apply_delta_to_source(source, delta_start, delta_end):
459
"""Extract a delta from source bytes, and apply it."""
461
cdef Py_ssize_t c_source_size
463
cdef Py_ssize_t c_delta_size
464
cdef Py_ssize_t c_delta_start, c_delta_end
466
if not PyString_CheckExact(source):
467
raise TypeError('source is not a str')
468
c_source_size = PyString_GET_SIZE(source)
469
c_delta_start = delta_start
470
c_delta_end = delta_end
471
if c_delta_start >= c_source_size:
472
raise ValueError('delta starts after source')
473
if c_delta_end > c_source_size:
474
raise ValueError('delta ends after source')
475
if c_delta_start >= c_delta_end:
476
raise ValueError('delta starts after it ends')
478
c_delta_size = c_delta_end - c_delta_start
479
c_source = PyString_AS_STRING(source)
480
c_delta = c_source + c_delta_start
481
# We don't use source_size, because we know the delta should not refer to
482
# any bytes after it starts
483
return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
486
def encode_base128_int(val):
487
"""Convert an integer into a 7-bit lsb encoding."""
488
cdef unsigned int c_val
489
cdef Py_ssize_t count
490
cdef unsigned int num_bytes
491
cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes
495
while c_val >= 0x80 and count < 8:
496
c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
499
if count >= 8 or c_val >= 0x80:
500
raise ValueError('encode_base128_int overflowed the buffer')
501
c_bytes[count] = <unsigned char>(c_val & 0xFF)
503
return PyString_FromStringAndSize(<char *>c_bytes, count)
506
def decode_base128_int(bytes):
507
"""Decode an integer from a 7-bit lsb encoding."""
510
cdef unsigned int uval
512
cdef Py_ssize_t num_low_bytes
513
cdef unsigned char *c_bytes
518
if not PyString_CheckExact(bytes):
519
raise TypeError('bytes is not a string')
520
c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
521
# We take off 1, because we have to be able to decode the non-expanded byte
522
num_low_bytes = PyString_GET_SIZE(bytes) - 1
523
while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
524
val = val | ((c_bytes[offset] & 0x7F) << shift)
527
if c_bytes[offset] & 0x80:
528
raise ValueError('Data not properly formatted, we ran out of'
529
' bytes before 0x80 stopped being set.')
530
val = val | (c_bytes[offset] << shift)
533
uval = <unsigned int> val