1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Compiled extensions for doing compression."""
20
cdef extern from "python-compat.h":
24
cdef extern from "Python.h":
25
ctypedef int Py_ssize_t # Required for older pyrex versions
26
int PyString_CheckExact(object)
27
char * PyString_AS_STRING(object)
28
Py_ssize_t PyString_GET_SIZE(object)
29
object PyString_FromStringAndSize(char *, Py_ssize_t)
33
ctypedef unsigned long size_t
34
void * malloc(size_t) nogil
35
void * realloc(void *, size_t) nogil
36
void free(void *) nogil
37
void memcpy(void *, void *, size_t) nogil
40
cdef extern from "delta.h":
44
unsigned long agg_offset
47
delta_index * create_delta_index(source_info *src, delta_index *old) nogil
48
delta_index * create_delta_index_from_delta(source_info *delta,
49
delta_index *old) nogil
50
void free_delta_index(delta_index *index) nogil
51
void *create_delta(delta_index *indexes,
52
void *buf, unsigned long bufsize,
53
unsigned long *delta_size, unsigned long max_delta_size) nogil
54
unsigned long get_delta_hdr_size(unsigned char **datap,
55
unsigned char *top) nogil
56
Py_ssize_t DELTA_SIZE_MIN
59
cdef void *safe_malloc(size_t count) except NULL:
61
result = malloc(count)
63
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
67
cdef void *safe_realloc(void * old, size_t count) except NULL:
69
result = realloc(old, count)
71
raise MemoryError('Failed to reallocate to %d bytes of memory'
76
cdef int safe_free(void **val) except -1:
82
def make_delta_index(source):
83
return DeltaIndex(source)
86
cdef class DeltaIndex:
88
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
89
# isn't performance critical
90
# cdef readonly list _sources
91
cdef readonly object _sources
92
cdef source_info *_source_infos
93
cdef delta_index *_index
94
cdef readonly unsigned int _max_num_sources
95
cdef public unsigned long _source_offset
97
def __init__(self, source=None):
100
self._max_num_sources = 65000
101
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
102
* self._max_num_sources)
103
self._source_offset = 0
105
if source is not None:
106
self.add_source(source, 0)
109
return '%s(%d, %d)' % (self.__class__.__name__,
110
len(self._sources), self._source_offset)
112
def __dealloc__(self):
113
if self._index != NULL:
114
free_delta_index(self._index)
116
safe_free(<void **>&self._source_infos)
118
def _has_index(self):
119
return (self._index != NULL)
121
def add_delta_source(self, delta, unadded_bytes):
122
"""Add a new delta to the source texts.
124
:param delta: The text of the delta, this must be a byte string.
125
:param unadded_bytes: Number of bytes that were added to the source
126
that were not indexed.
129
cdef Py_ssize_t c_delta_size
130
cdef delta_index *index
131
cdef unsigned int source_location
132
cdef source_info *src
133
cdef unsigned int num_indexes
135
if not PyString_CheckExact(delta):
136
raise TypeError('delta is not a str')
138
source_location = len(self._sources)
139
if source_location >= self._max_num_sources:
140
self._expand_sources()
141
self._sources.append(delta)
142
c_delta = PyString_AS_STRING(delta)
143
c_delta_size = PyString_GET_SIZE(delta)
144
src = self._source_infos + source_location
146
src.size = c_delta_size
147
src.agg_offset = self._source_offset + unadded_bytes
149
index = create_delta_index_from_delta(src, self._index)
150
self._source_offset = src.agg_offset + src.size
152
free_delta_index(self._index)
155
def add_source(self, source, unadded_bytes):
156
"""Add a new bit of source text to the delta indexes.
158
:param source: The text in question, this must be a byte string
159
:param unadded_bytes: Assume there are this many bytes that didn't get
160
added between this source and the end of the previous source.
163
cdef Py_ssize_t c_source_size
164
cdef delta_index *index
165
cdef unsigned int source_location
166
cdef source_info *src
167
cdef unsigned int num_indexes
169
if not PyString_CheckExact(source):
170
raise TypeError('source is not a str')
172
source_location = len(self._sources)
173
if source_location >= self._max_num_sources:
174
self._expand_sources()
175
if source_location != 0 and self._index == NULL:
176
# We were lazy about populating the index, create it now
177
self._populate_first_index()
178
self._sources.append(source)
179
c_source = PyString_AS_STRING(source)
180
c_source_size = PyString_GET_SIZE(source)
181
src = self._source_infos + source_location
183
src.size = c_source_size
185
src.agg_offset = self._source_offset + unadded_bytes
186
self._source_offset = src.agg_offset + src.size
187
# We delay creating the index on the first insert
188
if source_location != 0:
190
index = create_delta_index(src, self._index)
192
free_delta_index(self._index)
195
cdef _populate_first_index(self):
196
cdef delta_index *index
197
if len(self._sources) != 1 or self._index != NULL:
198
raise AssertionError('_populate_first_index should only be'
199
' called when we have a single source and no index yet')
201
# We know that self._index is already NULL, so whatever
202
# create_delta_index returns is fine
204
self._index = create_delta_index(&self._source_infos[0], NULL)
205
assert self._index != NULL
207
cdef _expand_sources(self):
208
raise RuntimeError('if we move self._source_infos, then we need to'
209
' change all of the index pointers as well.')
210
self._max_num_sources = self._max_num_sources * 2
211
self._source_infos = <source_info *>safe_realloc(self._source_infos,
213
* self._max_num_sources)
215
def make_delta(self, target_bytes, max_delta_size=0):
216
"""Create a delta from the current source to the target bytes."""
218
cdef Py_ssize_t target_size
220
cdef unsigned long delta_size
221
cdef unsigned long c_max_delta_size
223
if self._index == NULL:
224
if len(self._sources) == 0:
226
# We were just lazy about generating the index
227
self._populate_first_index()
229
if not PyString_CheckExact(target_bytes):
230
raise TypeError('target is not a str')
232
target = PyString_AS_STRING(target_bytes)
233
target_size = PyString_GET_SIZE(target_bytes)
235
# TODO: inline some of create_delta so we at least don't have to double
236
# malloc, and can instead use PyString_FromStringAndSize, to
237
# allocate the bytes into the final string
238
c_max_delta_size = max_delta_size
240
delta = create_delta(self._index,
242
&delta_size, c_max_delta_size)
245
result = PyString_FromStringAndSize(<char *>delta, delta_size)
250
def make_delta(source_bytes, target_bytes):
251
"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""
252
di = DeltaIndex(source_bytes)
253
return di.make_delta(target_bytes)
256
def apply_delta(source_bytes, delta_bytes):
257
"""Apply a delta generated by make_delta to source_bytes."""
259
cdef Py_ssize_t source_size
261
cdef Py_ssize_t delta_size
263
if not PyString_CheckExact(source_bytes):
264
raise TypeError('source is not a str')
265
if not PyString_CheckExact(delta_bytes):
266
raise TypeError('delta is not a str')
267
source = PyString_AS_STRING(source_bytes)
268
source_size = PyString_GET_SIZE(source_bytes)
269
delta = PyString_AS_STRING(delta_bytes)
270
delta_size = PyString_GET_SIZE(delta_bytes)
271
# Code taken from patch-delta.c, only brought here to give better error
272
# handling, and to avoid double allocating memory
273
if (delta_size < DELTA_SIZE_MIN):
274
# XXX: Invalid delta block
275
raise RuntimeError('delta_size %d smaller than min delta size %d'
276
% (delta_size, DELTA_SIZE_MIN))
278
return _apply_delta(source, source_size, delta, delta_size)
281
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
282
unsigned char cmd, unsigned int *offset, unsigned int *length) nogil:
283
"""Decode a copy instruction from the next few bytes.
285
A copy instruction is a variable number of bytes, so we will parse the
286
bytes we care about, and return the new position, as well as the offset and
287
length referred to in the bytes.
289
:param bytes: Pointer to the start of bytes after cmd
290
:param cmd: The command code
291
:return: Pointer to the bytes just after the last decode byte
293
cdef unsigned int off, size, count
301
off = off | (bytes[count] << 8)
304
off = off | (bytes[count] << 16)
307
off = off | (bytes[count] << 24)
313
size = size | (bytes[count] << 8)
316
size = size | (bytes[count] << 16)
325
cdef object _apply_delta(char *source, Py_ssize_t source_size,
326
char *delta, Py_ssize_t delta_size):
327
"""common functionality between apply_delta and apply_delta_to_source."""
328
cdef unsigned char *data, *top
329
cdef unsigned char *dst_buf, *out, cmd
331
cdef unsigned int cp_off, cp_size
334
data = <unsigned char *>delta
335
top = data + delta_size
337
# now the result size
338
size = get_delta_hdr_size(&data, top)
339
result = PyString_FromStringAndSize(NULL, size)
340
dst_buf = <unsigned char*>PyString_AS_STRING(result)
350
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
351
if (cp_off + cp_size < cp_size or
352
cp_off + cp_size > source_size or
356
memcpy(out, source + cp_off, cp_size)
358
size = size - cp_size
362
# cmd == 0 is reserved for future encoding
363
# extensions. In the mean time we must fail when
364
# encountering them (might be data corruption).
370
memcpy(out, data, cmd)
376
raise ValueError('Something wrong with:'
377
' cp_off = %s, cp_size = %s'
378
' source_size = %s, size = %s'
379
% (cp_off, cp_size, source_size, size))
381
raise ValueError('Got delta opcode: 0, not supported')
383
raise ValueError('Insert instruction longer than remaining'
384
' bytes: %d > %d' % (cmd, size))
387
if (data != top or size != 0):
388
raise RuntimeError('Did not extract the number of bytes we expected'
389
' we were left with %d bytes in "size", and top - data = %d'
390
% (size, <int>(top - data)))
393
# *dst_size = out - dst_buf;
394
if (out - dst_buf) != PyString_GET_SIZE(result):
395
raise RuntimeError('Number of bytes extracted did not match the'
396
' size encoded in the delta header.')
400
def apply_delta_to_source(source, delta_start, delta_end):
401
"""Extract a delta from source bytes, and apply it."""
403
cdef Py_ssize_t c_source_size
405
cdef Py_ssize_t c_delta_size
406
cdef Py_ssize_t c_delta_start, c_delta_end
408
if not PyString_CheckExact(source):
409
raise TypeError('source is not a str')
410
c_source_size = PyString_GET_SIZE(source)
411
c_delta_start = delta_start
412
c_delta_end = delta_end
413
if c_delta_start >= c_source_size:
414
raise ValueError('delta starts after source')
415
if c_delta_end > c_source_size:
416
raise ValueError('delta ends after source')
417
if c_delta_start >= c_delta_end:
418
raise ValueError('delta starts after it ends')
420
c_delta_size = c_delta_end - c_delta_start
421
c_source = PyString_AS_STRING(source)
422
c_delta = c_source + c_delta_start
423
# We don't use source_size, because we know the delta should not refer to
424
# any bytes after it starts
425
return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
428
def encode_base128_int(val):
429
"""Convert an integer into a 7-bit lsb encoding."""
430
cdef unsigned int c_val
431
cdef Py_ssize_t count
432
cdef unsigned int num_bytes
433
cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes
437
while c_val >= 0x80 and count < 8:
438
c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
441
if count >= 8 or c_val >= 0x80:
442
raise ValueError('encode_base128_int overflowed the buffer')
443
c_bytes[count] = <unsigned char>(c_val & 0xFF)
445
return PyString_FromStringAndSize(<char *>c_bytes, count)
448
def decode_base128_int(bytes):
449
"""Decode an integer from a 7-bit lsb encoding."""
452
cdef unsigned int uval
454
cdef Py_ssize_t num_low_bytes
455
cdef unsigned char *c_bytes
460
if not PyString_CheckExact(bytes):
461
raise TypeError('bytes is not a string')
462
c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
463
# We take off 1, because we have to be able to decode the non-expanded byte
464
num_low_bytes = PyString_GET_SIZE(bytes) - 1
465
while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
466
val = val | ((c_bytes[offset] & 0x7F) << shift)
469
if c_bytes[offset] & 0x80:
470
raise ValueError('Data not properly formatted, we ran out of'
471
' bytes before 0x80 stopped being set.')
472
val = val | (c_bytes[offset] << shift)
475
uval = <unsigned int> val