1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Compiled extensions for doing compression."""
20
cdef extern from "python-compat.h":
24
cdef extern from "Python.h":
25
ctypedef int Py_ssize_t # Required for older pyrex versions
26
int PyString_CheckExact(object)
27
char * PyString_AS_STRING(object)
28
Py_ssize_t PyString_GET_SIZE(object)
29
object PyString_FromStringAndSize(char *, Py_ssize_t)
33
ctypedef unsigned long size_t
34
void * malloc(size_t) nogil
35
void * realloc(void *, size_t) nogil
36
void free(void *) nogil
37
void memcpy(void *, void *, size_t) nogil
40
cdef extern from "delta.h":
44
unsigned long agg_offset
47
delta_index * create_delta_index(source_info *src, delta_index *old) nogil
48
delta_index * create_delta_index_from_delta(source_info *delta,
49
delta_index *old) nogil
50
void free_delta_index(delta_index *index) nogil
51
void *create_delta(delta_index *indexes,
52
void *buf, unsigned long bufsize,
53
unsigned long *delta_size, unsigned long max_delta_size) nogil
54
unsigned long get_delta_hdr_size(unsigned char **datap,
55
unsigned char *top) nogil
56
Py_ssize_t DELTA_SIZE_MIN
59
cdef void *safe_malloc(size_t count) except NULL:
61
result = malloc(count)
63
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
67
cdef void *safe_realloc(void * old, size_t count) except NULL:
69
result = realloc(old, count)
71
raise MemoryError('Failed to reallocate to %d bytes of memory'
76
cdef int safe_free(void **val) except -1:
82
def make_delta_index(source):
83
return DeltaIndex(source)
86
cdef class DeltaIndex:
88
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
89
# isn't performance critical
90
# cdef readonly list _sources
91
cdef readonly object _sources
92
cdef source_info *_source_infos
93
cdef delta_index *_index
94
cdef readonly unsigned int _max_num_sources
95
cdef public unsigned long _source_offset
97
def __init__(self, source=None):
100
self._max_num_sources = 65000
101
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
102
* self._max_num_sources)
103
self._source_offset = 0
105
if source is not None:
106
self.add_source(source, 0)
109
return '%s(%d, %d)' % (self.__class__.__name__,
110
len(self._sources), self._source_offset)
112
def __dealloc__(self):
113
if self._index != NULL:
114
free_delta_index(self._index)
116
safe_free(<void **>&self._source_infos)
118
def _has_index(self):
119
return (self._index != NULL)
121
def add_delta_source(self, delta, unadded_bytes):
122
"""Add a new delta to the source texts.
124
:param delta: The text of the delta, this must be a byte string.
125
:param unadded_bytes: Number of bytes that were added to the source
126
that were not indexed.
129
cdef Py_ssize_t c_delta_size
130
cdef delta_index *index
131
cdef unsigned int source_location
132
cdef source_info *src
133
cdef unsigned int num_indexes
135
if not PyString_CheckExact(delta):
136
raise TypeError('delta is not a str')
138
source_location = len(self._sources)
139
if source_location >= self._max_num_sources:
140
self._expand_sources()
141
self._sources.append(delta)
142
c_delta = PyString_AS_STRING(delta)
143
c_delta_size = PyString_GET_SIZE(delta)
144
src = self._source_infos + source_location
146
src.size = c_delta_size
147
src.agg_offset = self._source_offset + unadded_bytes
149
index = create_delta_index_from_delta(src, self._index)
150
self._source_offset = src.agg_offset + src.size
152
free_delta_index(self._index)
155
def add_source(self, source, unadded_bytes):
156
"""Add a new bit of source text to the delta indexes.
158
:param source: The text in question, this must be a byte string
159
:param unadded_bytes: Assume there are this many bytes that didn't get
160
added between this source and the end of the previous source.
163
cdef Py_ssize_t c_source_size
164
cdef delta_index *index
165
cdef unsigned int source_location
166
cdef source_info *src
167
cdef unsigned int num_indexes
169
if not PyString_CheckExact(source):
170
raise TypeError('source is not a str')
172
source_location = len(self._sources)
173
if source_location >= self._max_num_sources:
174
self._expand_sources()
175
if source_location != 0 and self._index == NULL:
176
# We were lazy about populating the index, create it now
177
self._populate_first_index()
178
self._sources.append(source)
179
c_source = PyString_AS_STRING(source)
180
c_source_size = PyString_GET_SIZE(source)
181
src = self._source_infos + source_location
183
src.size = c_source_size
185
src.agg_offset = self._source_offset + unadded_bytes
186
self._source_offset = src.agg_offset + src.size
187
# We delay creating the index on the first insert
188
if source_location != 0:
190
index = create_delta_index(src, self._index)
192
free_delta_index(self._index)
195
cdef _populate_first_index(self):
196
cdef delta_index *index
197
if len(self._sources) != 1 or self._index != NULL:
198
raise AssertionError('_populate_first_index should only be'
199
' called when we have a single source and no index yet')
201
# We know that self._index is already NULL, so whatever
202
# create_delta_index returns is fine
204
self._index = create_delta_index(&self._source_infos[0], NULL)
205
assert self._index != NULL
207
cdef _expand_sources(self):
208
raise RuntimeError('if we move self._source_infos, then we need to'
209
' change all of the index pointers as well.')
210
self._max_num_sources = self._max_num_sources * 2
211
self._source_infos = <source_info *>safe_realloc(self._source_infos,
213
* self._max_num_sources)
215
def make_delta(self, target_bytes, max_delta_size=0):
216
"""Create a delta from the current source to the target bytes."""
218
cdef Py_ssize_t target_size
220
cdef unsigned long delta_size
221
cdef unsigned long c_max_delta_size
223
if self._index == NULL:
224
if len(self._sources) == 0:
226
# We were just lazy about generating the index
227
self._populate_first_index()
229
if not PyString_CheckExact(target_bytes):
230
raise TypeError('target is not a str')
232
target = PyString_AS_STRING(target_bytes)
233
target_size = PyString_GET_SIZE(target_bytes)
235
# TODO: inline some of create_delta so we at least don't have to double
236
# malloc, and can instead use PyString_FromStringAndSize, to
237
# allocate the bytes into the final string
238
c_max_delta_size = max_delta_size
240
delta = create_delta(self._index,
242
&delta_size, c_max_delta_size)
245
result = PyString_FromStringAndSize(<char *>delta, delta_size)
250
def make_delta(source_bytes, target_bytes):
251
"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""
252
di = DeltaIndex(source_bytes)
253
return di.make_delta(target_bytes)
256
def apply_delta(source_bytes, delta_bytes):
257
"""Apply a delta generated by make_delta to source_bytes."""
259
cdef Py_ssize_t source_size
261
cdef Py_ssize_t delta_size
263
if not PyString_CheckExact(source_bytes):
264
raise TypeError('source is not a str')
265
if not PyString_CheckExact(delta_bytes):
266
raise TypeError('delta is not a str')
267
source = PyString_AS_STRING(source_bytes)
268
source_size = PyString_GET_SIZE(source_bytes)
269
delta = PyString_AS_STRING(delta_bytes)
270
delta_size = PyString_GET_SIZE(delta_bytes)
271
# Code taken from patch-delta.c, only brought here to give better error
272
# handling, and to avoid double allocating memory
273
if (delta_size < DELTA_SIZE_MIN):
274
# XXX: Invalid delta block
275
raise RuntimeError('delta_size %d smaller than min delta size %d'
276
% (delta_size, DELTA_SIZE_MIN))
278
return _apply_delta(source, source_size, delta, delta_size)
281
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
282
unsigned char cmd, unsigned int *offset,
283
unsigned int *length) nogil: # cannot_raise
284
"""Decode a copy instruction from the next few bytes.
286
A copy instruction is a variable number of bytes, so we will parse the
287
bytes we care about, and return the new position, as well as the offset and
288
length referred to in the bytes.
290
:param bytes: Pointer to the start of bytes after cmd
291
:param cmd: The command code
292
:return: Pointer to the bytes just after the last decode byte
294
cdef unsigned int off, size, count
302
off = off | (bytes[count] << 8)
305
off = off | (bytes[count] << 16)
308
off = off | (bytes[count] << 24)
314
size = size | (bytes[count] << 8)
317
size = size | (bytes[count] << 16)
326
cdef object _apply_delta(char *source, Py_ssize_t source_size,
327
char *delta, Py_ssize_t delta_size):
328
"""common functionality between apply_delta and apply_delta_to_source."""
329
cdef unsigned char *data, *top
330
cdef unsigned char *dst_buf, *out, cmd
332
cdef unsigned int cp_off, cp_size
335
data = <unsigned char *>delta
336
top = data + delta_size
338
# now the result size
339
size = get_delta_hdr_size(&data, top)
340
result = PyString_FromStringAndSize(NULL, size)
341
dst_buf = <unsigned char*>PyString_AS_STRING(result)
351
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
352
if (cp_off + cp_size < cp_size or
353
cp_off + cp_size > source_size or
357
memcpy(out, source + cp_off, cp_size)
359
size = size - cp_size
363
# cmd == 0 is reserved for future encoding
364
# extensions. In the mean time we must fail when
365
# encountering them (might be data corruption).
371
memcpy(out, data, cmd)
377
raise ValueError('Something wrong with:'
378
' cp_off = %s, cp_size = %s'
379
' source_size = %s, size = %s'
380
% (cp_off, cp_size, source_size, size))
382
raise ValueError('Got delta opcode: 0, not supported')
384
raise ValueError('Insert instruction longer than remaining'
385
' bytes: %d > %d' % (cmd, size))
388
if (data != top or size != 0):
389
raise RuntimeError('Did not extract the number of bytes we expected'
390
' we were left with %d bytes in "size", and top - data = %d'
391
% (size, <int>(top - data)))
394
# *dst_size = out - dst_buf;
395
if (out - dst_buf) != PyString_GET_SIZE(result):
396
raise RuntimeError('Number of bytes extracted did not match the'
397
' size encoded in the delta header.')
401
def apply_delta_to_source(source, delta_start, delta_end):
402
"""Extract a delta from source bytes, and apply it."""
404
cdef Py_ssize_t c_source_size
406
cdef Py_ssize_t c_delta_size
407
cdef Py_ssize_t c_delta_start, c_delta_end
409
if not PyString_CheckExact(source):
410
raise TypeError('source is not a str')
411
c_source_size = PyString_GET_SIZE(source)
412
c_delta_start = delta_start
413
c_delta_end = delta_end
414
if c_delta_start >= c_source_size:
415
raise ValueError('delta starts after source')
416
if c_delta_end > c_source_size:
417
raise ValueError('delta ends after source')
418
if c_delta_start >= c_delta_end:
419
raise ValueError('delta starts after it ends')
421
c_delta_size = c_delta_end - c_delta_start
422
c_source = PyString_AS_STRING(source)
423
c_delta = c_source + c_delta_start
424
# We don't use source_size, because we know the delta should not refer to
425
# any bytes after it starts
426
return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
429
def encode_base128_int(val):
430
"""Convert an integer into a 7-bit lsb encoding."""
431
cdef unsigned int c_val
432
cdef Py_ssize_t count
433
cdef unsigned int num_bytes
434
cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes
438
while c_val >= 0x80 and count < 8:
439
c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
442
if count >= 8 or c_val >= 0x80:
443
raise ValueError('encode_base128_int overflowed the buffer')
444
c_bytes[count] = <unsigned char>(c_val & 0xFF)
446
return PyString_FromStringAndSize(<char *>c_bytes, count)
449
def decode_base128_int(bytes):
450
"""Decode an integer from a 7-bit lsb encoding."""
453
cdef unsigned int uval
455
cdef Py_ssize_t num_low_bytes
456
cdef unsigned char *c_bytes
461
if not PyString_CheckExact(bytes):
462
raise TypeError('bytes is not a string')
463
c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
464
# We take off 1, because we have to be able to decode the non-expanded byte
465
num_low_bytes = PyString_GET_SIZE(bytes) - 1
466
while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
467
val = val | ((c_bytes[offset] & 0x7F) << shift)
470
if c_bytes[offset] & 0x80:
471
raise ValueError('Data not properly formatted, we ran out of'
472
' bytes before 0x80 stopped being set.')
473
val = val | (c_bytes[offset] << shift)
476
uval = <unsigned int> val