1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Compiled extensions for doing compression."""
20
cdef extern from "python-compat.h":
24
cdef extern from "Python.h":
25
ctypedef struct PyObject:
27
ctypedef int Py_ssize_t # Required for older pyrex versions
28
int PyString_CheckExact(object)
29
char * PyString_AS_STRING(object)
30
Py_ssize_t PyString_GET_SIZE(object)
31
object PyString_FromStringAndSize(char *, Py_ssize_t)
35
ctypedef unsigned long size_t
36
void * malloc(size_t) nogil
37
void * realloc(void *, size_t) nogil
38
void free(void *) nogil
39
void memcpy(void *, void *, size_t) nogil
42
cdef extern from "delta.h":
46
unsigned long agg_offset
49
delta_index * create_delta_index(source_info *src, delta_index *old) nogil
50
delta_index * create_delta_index_from_delta(source_info *delta,
51
delta_index *old) nogil
52
void free_delta_index(delta_index *index) nogil
53
void *create_delta(delta_index *indexes,
54
void *buf, unsigned long bufsize,
55
unsigned long *delta_size, unsigned long max_delta_size) nogil
56
unsigned long get_delta_hdr_size(unsigned char **datap,
57
unsigned char *top) nogil
58
unsigned long sizeof_delta_index(delta_index *index)
59
Py_ssize_t DELTA_SIZE_MIN
62
cdef void *safe_malloc(size_t count) except NULL:
64
result = malloc(count)
66
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
70
cdef void *safe_realloc(void * old, size_t count) except NULL:
72
result = realloc(old, count)
74
raise MemoryError('Failed to reallocate to %d bytes of memory'
79
cdef int safe_free(void **val) except -1:
85
def make_delta_index(source):
86
return DeltaIndex(source)
89
cdef class DeltaIndex:
91
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
92
# isn't performance critical
93
# cdef readonly list _sources
94
cdef readonly object _sources
95
cdef source_info *_source_infos
96
cdef delta_index *_index
97
cdef public unsigned long _source_offset
98
cdef readonly unsigned int _max_num_sources
100
def __init__(self, source=None):
103
self._max_num_sources = 65000
104
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
105
* self._max_num_sources)
106
self._source_offset = 0
108
if source is not None:
109
self.add_source(source, 0)
111
def __sizeof__(self):
112
# We want to track the _source_infos allocations, but the referenced
113
# void* are actually tracked in _sources itself.
114
# XXX: Cython is capable of doing sizeof(class) and returning the size
115
# of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
116
# to do it manually. *sigh* Note that we might get it wrong
117
# because of alignment issues.
119
# PyObject start, vtable *, 3 object pointers, 2 C ints
120
size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
121
+ sizeof(unsigned long)
122
+ sizeof(unsigned int))
123
+ (sizeof(source_info) * self._max_num_sources)
124
+ sizeof_delta_index(self._index))
128
return '%s(%d, %d)' % (self.__class__.__name__,
129
len(self._sources), self._source_offset)
131
def __dealloc__(self):
132
if self._index != NULL:
133
free_delta_index(self._index)
135
safe_free(<void **>&self._source_infos)
137
def _has_index(self):
138
return (self._index != NULL)
140
def add_delta_source(self, delta, unadded_bytes):
141
"""Add a new delta to the source texts.
143
:param delta: The text of the delta, this must be a byte string.
144
:param unadded_bytes: Number of bytes that were added to the source
145
that were not indexed.
148
cdef Py_ssize_t c_delta_size
149
cdef delta_index *index
150
cdef unsigned int source_location
151
cdef source_info *src
152
cdef unsigned int num_indexes
154
if not PyString_CheckExact(delta):
155
raise TypeError('delta is not a str')
157
source_location = len(self._sources)
158
if source_location >= self._max_num_sources:
159
self._expand_sources()
160
self._sources.append(delta)
161
c_delta = PyString_AS_STRING(delta)
162
c_delta_size = PyString_GET_SIZE(delta)
163
src = self._source_infos + source_location
165
src.size = c_delta_size
166
src.agg_offset = self._source_offset + unadded_bytes
168
index = create_delta_index_from_delta(src, self._index)
169
self._source_offset = src.agg_offset + src.size
171
free_delta_index(self._index)
174
def add_source(self, source, unadded_bytes):
175
"""Add a new bit of source text to the delta indexes.
177
:param source: The text in question, this must be a byte string
178
:param unadded_bytes: Assume there are this many bytes that didn't get
179
added between this source and the end of the previous source.
182
cdef Py_ssize_t c_source_size
183
cdef delta_index *index
184
cdef unsigned int source_location
185
cdef source_info *src
186
cdef unsigned int num_indexes
188
if not PyString_CheckExact(source):
189
raise TypeError('source is not a str')
191
source_location = len(self._sources)
192
if source_location >= self._max_num_sources:
193
self._expand_sources()
194
if source_location != 0 and self._index == NULL:
195
# We were lazy about populating the index, create it now
196
self._populate_first_index()
197
self._sources.append(source)
198
c_source = PyString_AS_STRING(source)
199
c_source_size = PyString_GET_SIZE(source)
200
src = self._source_infos + source_location
202
src.size = c_source_size
204
src.agg_offset = self._source_offset + unadded_bytes
205
self._source_offset = src.agg_offset + src.size
206
# We delay creating the index on the first insert
207
if source_location != 0:
209
index = create_delta_index(src, self._index)
211
free_delta_index(self._index)
214
cdef _populate_first_index(self):
215
cdef delta_index *index
216
if len(self._sources) != 1 or self._index != NULL:
217
raise AssertionError('_populate_first_index should only be'
218
' called when we have a single source and no index yet')
220
# We know that self._index is already NULL, so whatever
221
# create_delta_index returns is fine
223
self._index = create_delta_index(&self._source_infos[0], NULL)
224
assert self._index != NULL
226
cdef _expand_sources(self):
227
raise RuntimeError('if we move self._source_infos, then we need to'
228
' change all of the index pointers as well.')
229
self._max_num_sources = self._max_num_sources * 2
230
self._source_infos = <source_info *>safe_realloc(self._source_infos,
232
* self._max_num_sources)
234
def make_delta(self, target_bytes, max_delta_size=0):
235
"""Create a delta from the current source to the target bytes."""
237
cdef Py_ssize_t target_size
239
cdef unsigned long delta_size
240
cdef unsigned long c_max_delta_size
242
if self._index == NULL:
243
if len(self._sources) == 0:
245
# We were just lazy about generating the index
246
self._populate_first_index()
248
if not PyString_CheckExact(target_bytes):
249
raise TypeError('target is not a str')
251
target = PyString_AS_STRING(target_bytes)
252
target_size = PyString_GET_SIZE(target_bytes)
254
# TODO: inline some of create_delta so we at least don't have to double
255
# malloc, and can instead use PyString_FromStringAndSize, to
256
# allocate the bytes into the final string
257
c_max_delta_size = max_delta_size
259
delta = create_delta(self._index,
261
&delta_size, c_max_delta_size)
264
result = PyString_FromStringAndSize(<char *>delta, delta_size)
269
def make_delta(source_bytes, target_bytes):
270
"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""
271
di = DeltaIndex(source_bytes)
272
return di.make_delta(target_bytes)
275
def apply_delta(source_bytes, delta_bytes):
276
"""Apply a delta generated by make_delta to source_bytes."""
278
cdef Py_ssize_t source_size
280
cdef Py_ssize_t delta_size
282
if not PyString_CheckExact(source_bytes):
283
raise TypeError('source is not a str')
284
if not PyString_CheckExact(delta_bytes):
285
raise TypeError('delta is not a str')
286
source = PyString_AS_STRING(source_bytes)
287
source_size = PyString_GET_SIZE(source_bytes)
288
delta = PyString_AS_STRING(delta_bytes)
289
delta_size = PyString_GET_SIZE(delta_bytes)
290
# Code taken from patch-delta.c, only brought here to give better error
291
# handling, and to avoid double allocating memory
292
if (delta_size < DELTA_SIZE_MIN):
293
# XXX: Invalid delta block
294
raise RuntimeError('delta_size %d smaller than min delta size %d'
295
% (delta_size, DELTA_SIZE_MIN))
297
return _apply_delta(source, source_size, delta, delta_size)
300
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
301
unsigned char cmd, unsigned int *offset,
302
unsigned int *length) nogil: # cannot_raise
303
"""Decode a copy instruction from the next few bytes.
305
A copy instruction is a variable number of bytes, so we will parse the
306
bytes we care about, and return the new position, as well as the offset and
307
length referred to in the bytes.
309
:param bytes: Pointer to the start of bytes after cmd
310
:param cmd: The command code
311
:return: Pointer to the bytes just after the last decode byte
313
cdef unsigned int off, size, count
321
off = off | (bytes[count] << 8)
324
off = off | (bytes[count] << 16)
327
off = off | (bytes[count] << 24)
333
size = size | (bytes[count] << 8)
336
size = size | (bytes[count] << 16)
345
cdef object _apply_delta(char *source, Py_ssize_t source_size,
346
char *delta, Py_ssize_t delta_size):
347
"""common functionality between apply_delta and apply_delta_to_source."""
348
cdef unsigned char *data, *top
349
cdef unsigned char *dst_buf, *out, cmd
351
cdef unsigned int cp_off, cp_size
354
data = <unsigned char *>delta
355
top = data + delta_size
357
# now the result size
358
size = get_delta_hdr_size(&data, top)
359
result = PyString_FromStringAndSize(NULL, size)
360
dst_buf = <unsigned char*>PyString_AS_STRING(result)
370
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
371
if (cp_off + cp_size < cp_size or
372
cp_off + cp_size > source_size or
376
memcpy(out, source + cp_off, cp_size)
378
size = size - cp_size
382
# cmd == 0 is reserved for future encoding
383
# extensions. In the mean time we must fail when
384
# encountering them (might be data corruption).
390
memcpy(out, data, cmd)
396
raise ValueError('Something wrong with:'
397
' cp_off = %s, cp_size = %s'
398
' source_size = %s, size = %s'
399
% (cp_off, cp_size, source_size, size))
401
raise ValueError('Got delta opcode: 0, not supported')
403
raise ValueError('Insert instruction longer than remaining'
404
' bytes: %d > %d' % (cmd, size))
407
if (data != top or size != 0):
408
raise RuntimeError('Did not extract the number of bytes we expected'
409
' we were left with %d bytes in "size", and top - data = %d'
410
% (size, <int>(top - data)))
413
# *dst_size = out - dst_buf;
414
if (out - dst_buf) != PyString_GET_SIZE(result):
415
raise RuntimeError('Number of bytes extracted did not match the'
416
' size encoded in the delta header.')
420
def apply_delta_to_source(source, delta_start, delta_end):
421
"""Extract a delta from source bytes, and apply it."""
423
cdef Py_ssize_t c_source_size
425
cdef Py_ssize_t c_delta_size
426
cdef Py_ssize_t c_delta_start, c_delta_end
428
if not PyString_CheckExact(source):
429
raise TypeError('source is not a str')
430
c_source_size = PyString_GET_SIZE(source)
431
c_delta_start = delta_start
432
c_delta_end = delta_end
433
if c_delta_start >= c_source_size:
434
raise ValueError('delta starts after source')
435
if c_delta_end > c_source_size:
436
raise ValueError('delta ends after source')
437
if c_delta_start >= c_delta_end:
438
raise ValueError('delta starts after it ends')
440
c_delta_size = c_delta_end - c_delta_start
441
c_source = PyString_AS_STRING(source)
442
c_delta = c_source + c_delta_start
443
# We don't use source_size, because we know the delta should not refer to
444
# any bytes after it starts
445
return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
448
def encode_base128_int(val):
449
"""Convert an integer into a 7-bit lsb encoding."""
450
cdef unsigned int c_val
451
cdef Py_ssize_t count
452
cdef unsigned int num_bytes
453
cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes
457
while c_val >= 0x80 and count < 8:
458
c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
461
if count >= 8 or c_val >= 0x80:
462
raise ValueError('encode_base128_int overflowed the buffer')
463
c_bytes[count] = <unsigned char>(c_val & 0xFF)
465
return PyString_FromStringAndSize(<char *>c_bytes, count)
468
def decode_base128_int(bytes):
469
"""Decode an integer from a 7-bit lsb encoding."""
472
cdef unsigned int uval
474
cdef Py_ssize_t num_low_bytes
475
cdef unsigned char *c_bytes
480
if not PyString_CheckExact(bytes):
481
raise TypeError('bytes is not a string')
482
c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
483
# We take off 1, because we have to be able to decode the non-expanded byte
484
num_low_bytes = PyString_GET_SIZE(bytes) - 1
485
while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
486
val = val | ((c_bytes[offset] & 0x7F) << shift)
489
if c_bytes[offset] & 0x80:
490
raise ValueError('Data not properly formatted, we ran out of'
491
' bytes before 0x80 stopped being set.')
492
val = val | (c_bytes[offset] << shift)
495
uval = <unsigned int> val