1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Compiled extensions for doing compression."""
20
cdef extern from "python-compat.h":
24
cdef extern from "Python.h":
25
ctypedef int Py_ssize_t # Required for older pyrex versions
26
int PyString_CheckExact(object)
27
char * PyString_AS_STRING(object)
28
Py_ssize_t PyString_GET_SIZE(object)
29
object PyString_FromStringAndSize(char *, Py_ssize_t)
33
ctypedef unsigned long size_t
35
void * realloc(void *, size_t)
37
void memcpy(void *, void *, size_t)
40
cdef extern from "delta.h":
44
unsigned long agg_offset
47
delta_index * create_delta_index(source_info *src, delta_index *old)
48
delta_index * create_delta_index_from_delta(source_info *delta,
50
void free_delta_index(delta_index *index)
51
void *create_delta(delta_index *indexes,
52
void *buf, unsigned long bufsize,
53
unsigned long *delta_size, unsigned long max_delta_size)
54
unsigned long get_delta_hdr_size(unsigned char **datap,
56
Py_ssize_t DELTA_SIZE_MIN
57
void *patch_delta(void *src_buf, unsigned long src_size,
58
void *delta_buf, unsigned long delta_size,
59
unsigned long *dst_size)
62
cdef void *safe_malloc(size_t count) except NULL:
64
result = malloc(count)
66
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
70
cdef void *safe_realloc(void * old, size_t count) except NULL:
72
result = realloc(old, count)
74
raise MemoryError('Failed to reallocate to %d bytes of memory'
79
cdef int safe_free(void **val) except -1:
85
def make_delta_index(source):
86
return DeltaIndex(source)
89
cdef class DeltaIndex:
91
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
92
# isn't performance critical
93
# cdef readonly list _sources
94
cdef readonly object _sources
95
cdef source_info *_source_infos
96
cdef delta_index *_index
97
cdef readonly unsigned int _max_num_sources
98
cdef public unsigned long _source_offset
100
def __init__(self, source=None):
103
self._max_num_sources = 65000
104
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
105
* self._max_num_sources)
106
self._source_offset = 0
108
if source is not None:
109
self.add_source(source, 0)
112
return '%s(%d, %d)' % (self.__class__.__name__,
113
len(self._sources), self._source_offset)
115
def __dealloc__(self):
116
if self._index != NULL:
117
free_delta_index(self._index)
119
safe_free(<void **>&self._source_infos)
121
def add_delta_source(self, delta, unadded_bytes):
122
"""Add a new delta to the source texts.
124
:param delta: The text of the delta, this must be a byte string.
125
:param unadded_bytes: Number of bytes that were added to the source
126
that were not indexed.
129
cdef Py_ssize_t c_delta_size
130
cdef delta_index *index
131
cdef unsigned int source_location
132
cdef source_info *src
133
cdef unsigned int num_indexes
135
if not PyString_CheckExact(delta):
136
raise TypeError('delta is not a str')
138
source_location = len(self._sources)
139
if source_location >= self._max_num_sources:
140
self._expand_sources()
141
self._sources.append(delta)
142
c_delta = PyString_AS_STRING(delta)
143
c_delta_size = PyString_GET_SIZE(delta)
144
src = self._source_infos + source_location
146
src.size = c_delta_size
147
src.agg_offset = self._source_offset + unadded_bytes
148
index = create_delta_index_from_delta(src, self._index)
149
self._source_offset = src.agg_offset + src.size
151
free_delta_index(self._index)
154
def add_source(self, source, unadded_bytes):
155
"""Add a new bit of source text to the delta indexes.
157
:param source: The text in question, this must be a byte string
158
:param unadded_bytes: Assume there are this many bytes that didn't get
159
added between this source and the end of the previous source.
162
cdef Py_ssize_t c_source_size
163
cdef delta_index *index
164
cdef unsigned int source_location
165
cdef source_info *src
166
cdef unsigned int num_indexes
168
if not PyString_CheckExact(source):
169
raise TypeError('source is not a str')
171
source_location = len(self._sources)
172
if source_location >= self._max_num_sources:
173
self._expand_sources()
174
self._sources.append(source)
175
c_source = PyString_AS_STRING(source)
176
c_source_size = PyString_GET_SIZE(source)
177
src = self._source_infos + source_location
179
src.size = c_source_size
181
src.agg_offset = self._source_offset + unadded_bytes
182
index = create_delta_index(src, self._index)
183
self._source_offset = src.agg_offset + src.size
185
free_delta_index(self._index)
188
cdef _expand_sources(self):
189
raise RuntimeError('if we move self._source_infos, then we need to'
190
' change all of the index pointers as well.')
191
self._max_num_sources = self._max_num_sources * 2
192
self._source_infos = <source_info *>safe_realloc(self._source_infos,
194
* self._max_num_sources)
196
def make_delta(self, target_bytes, max_delta_size=0):
197
"""Create a delta from the current source to the target bytes."""
199
cdef Py_ssize_t target_size
201
cdef unsigned long delta_size
203
if self._index == NULL:
206
if not PyString_CheckExact(target_bytes):
207
raise TypeError('target is not a str')
209
target = PyString_AS_STRING(target_bytes)
210
target_size = PyString_GET_SIZE(target_bytes)
212
# TODO: inline some of create_delta so we at least don't have to double
213
# malloc, and can instead use PyString_FromStringAndSize, to
214
# allocate the bytes into the final string
215
delta = create_delta(self._index,
217
&delta_size, max_delta_size)
220
result = PyString_FromStringAndSize(<char *>delta, delta_size)
225
def make_delta(source_bytes, target_bytes):
226
"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""
227
di = DeltaIndex(source_bytes)
228
return di.make_delta(target_bytes)
231
def apply_delta(source_bytes, delta_bytes):
232
"""Apply a delta generated by make_delta to source_bytes."""
234
cdef Py_ssize_t source_size
236
cdef Py_ssize_t delta_size
238
if not PyString_CheckExact(source_bytes):
239
raise TypeError('source is not a str')
240
if not PyString_CheckExact(delta_bytes):
241
raise TypeError('delta is not a str')
242
source = PyString_AS_STRING(source_bytes)
243
source_size = PyString_GET_SIZE(source_bytes)
244
delta = PyString_AS_STRING(delta_bytes)
245
delta_size = PyString_GET_SIZE(delta_bytes)
246
# Code taken from patch-delta.c, only brought here to give better error
247
# handling, and to avoid double allocating memory
248
if (delta_size < DELTA_SIZE_MIN):
249
# XXX: Invalid delta block
250
raise RuntimeError('delta_size %d smaller than min delta size %d'
251
% (delta_size, DELTA_SIZE_MIN))
253
return _apply_delta(source, source_size, delta, delta_size)
256
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
257
unsigned char cmd, unsigned int *offset, unsigned int *length):
258
"""Decode a copy instruction from the next few bytes.
260
A copy instruction is a variable number of bytes, so we will parse the
261
bytes we care about, and return the new position, as well as the offset and
262
length referred to in the bytes.
264
:param bytes: Pointer to the start of bytes after cmd
265
:param cmd: The command code
266
:return: Pointer to the bytes just after the last decode byte
268
cdef unsigned int off, size, count
276
off = off | (bytes[count] << 8)
279
off = off | (bytes[count] << 16)
282
off = off | (bytes[count] << 24)
288
size = size | (bytes[count] << 8)
291
size = size | (bytes[count] << 16)
300
cdef object _apply_delta(char *source, Py_ssize_t source_size,
301
char *delta, Py_ssize_t delta_size):
302
"""common functionality between apply_delta and apply_delta_to_source."""
303
cdef unsigned char *data, *top
304
cdef unsigned char *dst_buf, *out, cmd
306
cdef unsigned int cp_off, cp_size
308
data = <unsigned char *>delta
309
top = data + delta_size
311
# now the result size
312
size = get_delta_hdr_size(&data, top)
313
result = PyString_FromStringAndSize(NULL, size)
314
dst_buf = <unsigned char*>PyString_AS_STRING(result)
322
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
323
if (cp_off + cp_size < cp_size or
324
cp_off + cp_size > source_size or
326
raise RuntimeError('Something wrong with:'
327
' cp_off = %s, cp_size = %s'
328
' source_size = %s, size = %s'
329
% (cp_off, cp_size, source_size, size))
330
memcpy(out, source + cp_off, cp_size)
332
size = size - cp_size
336
# cmd == 0 is reserved for future encoding
337
# extensions. In the mean time we must fail when
338
# encountering them (might be data corruption).
339
raise RuntimeError('Got delta opcode: 0, not supported')
341
raise RuntimeError('Insert instruction longer than remaining'
342
' bytes: %d > %d' % (cmd, size))
343
memcpy(out, data, cmd)
349
if (data != top or size != 0):
350
raise RuntimeError('Did not extract the number of bytes we expected'
351
' we were left with %d bytes in "size", and top - data = %d'
352
% (size, <int>(top - data)))
355
# *dst_size = out - dst_buf;
356
if (out - dst_buf) != PyString_GET_SIZE(result):
357
raise RuntimeError('Number of bytes extracted did not match the'
358
' size encoded in the delta header.')
362
def apply_delta_to_source(source, delta_start, delta_end):
363
"""Extract a delta from source bytes, and apply it."""
365
cdef Py_ssize_t c_source_size
367
cdef Py_ssize_t c_delta_size
368
cdef Py_ssize_t c_delta_start, c_delta_end
370
if not PyString_CheckExact(source):
371
raise TypeError('source is not a str')
372
c_source_size = PyString_GET_SIZE(source)
373
c_delta_start = delta_start
374
c_delta_end = delta_end
375
if c_delta_start >= c_source_size:
376
raise ValueError('delta starts after source')
377
if c_delta_end > c_source_size:
378
raise ValueError('delta ends after source')
379
if c_delta_start >= c_delta_end:
380
raise ValueError('delta starts after it ends')
382
c_delta_size = c_delta_end - c_delta_start
383
c_source = PyString_AS_STRING(source)
384
c_delta = c_source + c_delta_start
385
# We don't use source_size, because we know the delta should not refer to
386
# any bytes after it starts
387
return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
390
def encode_base128_int(val):
391
"""Convert an integer into a 7-bit lsb encoding."""
392
cdef unsigned int c_val
393
cdef Py_ssize_t count
394
cdef unsigned int num_bytes
395
cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes
399
while c_val >= 0x80 and count < 8:
400
c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
403
if count >= 8 or c_val >= 0x80:
404
raise ValueError('encode_base128_int overflowed the buffer')
405
c_bytes[count] = <unsigned char>(c_val & 0xFF)
407
return PyString_FromStringAndSize(<char *>c_bytes, count)
410
def decode_base128_int(bytes):
411
"""Decode an integer from a 7-bit lsb encoding."""
414
cdef unsigned int uval
416
cdef Py_ssize_t num_low_bytes
417
cdef unsigned char *c_bytes
422
if not PyString_CheckExact(bytes):
423
raise TypeError('bytes is not a string')
424
c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
425
# We take off 1, because we have to be able to decode the non-expanded byte
426
num_low_bytes = PyString_GET_SIZE(bytes) - 1
427
while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
428
val = val | ((c_bytes[offset] & 0x7F) << shift)
431
if c_bytes[offset] & 0x80:
432
raise ValueError('Data not properly formatted, we ran out of'
433
' bytes before 0x80 stopped being set.')
434
val = val | (c_bytes[offset] << shift)
437
uval = <unsigned int> val