1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Compiled extensions for doing compression."""
20
cdef extern from "python-compat.h":
24
cdef extern from "Python.h":
25
ctypedef int Py_ssize_t # Required for older pyrex versions
26
int PyString_CheckExact(object)
27
char * PyString_AS_STRING(object)
28
Py_ssize_t PyString_GET_SIZE(object)
29
object PyString_FromStringAndSize(char *, Py_ssize_t)
33
ctypedef unsigned long size_t
35
void * realloc(void *, size_t)
37
void memcpy(void *, void *, size_t)
40
cdef extern from "delta.h":
44
unsigned long agg_offset
47
delta_index * create_delta_index(source_info *src, delta_index *old)
48
delta_index * create_delta_index_from_delta(source_info *delta,
50
void free_delta_index(delta_index *index)
51
void *create_delta(delta_index *indexes,
52
void *buf, unsigned long bufsize,
53
unsigned long *delta_size, unsigned long max_delta_size)
54
unsigned long get_delta_hdr_size(unsigned char **datap,
56
Py_ssize_t DELTA_SIZE_MIN
57
void *patch_delta(void *src_buf, unsigned long src_size,
58
void *delta_buf, unsigned long delta_size,
59
unsigned long *dst_size)
62
cdef void *safe_malloc(size_t count) except NULL:
64
result = malloc(count)
66
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
70
cdef void *safe_realloc(void * old, size_t count) except NULL:
72
result = realloc(old, count)
74
raise MemoryError('Failed to reallocate to %d bytes of memory'
79
cdef int safe_free(void **val) except -1:
85
def make_delta_index(source):
86
return DeltaIndex(source)
89
cdef class DeltaIndex:
91
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
92
# isn't performance critical
93
# cdef readonly list _sources
94
cdef readonly object _sources
95
cdef source_info *_source_infos
96
cdef delta_index *_index
97
cdef readonly unsigned int _max_num_sources
98
cdef public unsigned long _source_offset
100
def __init__(self, source=None):
103
self._max_num_sources = 65000
104
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
105
* self._max_num_sources)
106
self._source_offset = 0
108
if source is not None:
109
self.add_source(source, 0)
112
return '%s(%d, %d)' % (self.__class__.__name__,
113
len(self._sources), self._source_offset)
115
def __dealloc__(self):
116
if self._index != NULL:
117
free_delta_index(self._index)
119
safe_free(<void **>&self._source_infos)
121
def _has_index(self):
122
return (self._index != NULL)
124
def add_delta_source(self, delta, unadded_bytes):
125
"""Add a new delta to the source texts.
127
:param delta: The text of the delta, this must be a byte string.
128
:param unadded_bytes: Number of bytes that were added to the source
129
that were not indexed.
132
cdef Py_ssize_t c_delta_size
133
cdef delta_index *index
134
cdef unsigned int source_location
135
cdef source_info *src
136
cdef unsigned int num_indexes
138
if not PyString_CheckExact(delta):
139
raise TypeError('delta is not a str')
141
source_location = len(self._sources)
142
if source_location >= self._max_num_sources:
143
self._expand_sources()
144
self._sources.append(delta)
145
c_delta = PyString_AS_STRING(delta)
146
c_delta_size = PyString_GET_SIZE(delta)
147
src = self._source_infos + source_location
149
src.size = c_delta_size
150
src.agg_offset = self._source_offset + unadded_bytes
151
index = create_delta_index_from_delta(src, self._index)
152
self._source_offset = src.agg_offset + src.size
154
free_delta_index(self._index)
157
def add_source(self, source, unadded_bytes):
158
"""Add a new bit of source text to the delta indexes.
160
:param source: The text in question, this must be a byte string
161
:param unadded_bytes: Assume there are this many bytes that didn't get
162
added between this source and the end of the previous source.
165
cdef Py_ssize_t c_source_size
166
cdef delta_index *index
167
cdef unsigned int source_location
168
cdef source_info *src
169
cdef unsigned int num_indexes
171
if not PyString_CheckExact(source):
172
raise TypeError('source is not a str')
174
source_location = len(self._sources)
175
if source_location >= self._max_num_sources:
176
self._expand_sources()
177
if source_location != 0 and self._index == NULL:
178
# We were lazy about populating the index, create it now
179
self._populate_first_index()
180
self._sources.append(source)
181
c_source = PyString_AS_STRING(source)
182
c_source_size = PyString_GET_SIZE(source)
183
src = self._source_infos + source_location
185
src.size = c_source_size
187
src.agg_offset = self._source_offset + unadded_bytes
188
self._source_offset = src.agg_offset + src.size
189
# We delay creating the index on the first insert
190
if source_location != 0:
191
index = create_delta_index(src, self._index)
193
free_delta_index(self._index)
196
cdef _populate_first_index(self):
197
cdef delta_index *index
198
if len(self._sources) != 1 or self._index != NULL:
199
raise AssertionError('_populate_first_index should only be'
200
' called when we have a single source and no index yet')
202
# We know that self._index is already NULL, so whatever
203
# create_delta_index returns is fine
204
self._index = create_delta_index(&self._source_infos[0], NULL)
205
assert self._index != NULL
207
cdef _expand_sources(self):
208
raise RuntimeError('if we move self._source_infos, then we need to'
209
' change all of the index pointers as well.')
210
self._max_num_sources = self._max_num_sources * 2
211
self._source_infos = <source_info *>safe_realloc(self._source_infos,
213
* self._max_num_sources)
215
def make_delta(self, target_bytes, max_delta_size=0):
216
"""Create a delta from the current source to the target bytes."""
218
cdef Py_ssize_t target_size
220
cdef unsigned long delta_size
222
if self._index == NULL:
223
if len(self._sources) == 0:
225
# We were just lazy about generating the index
226
self._populate_first_index()
228
if not PyString_CheckExact(target_bytes):
229
raise TypeError('target is not a str')
231
target = PyString_AS_STRING(target_bytes)
232
target_size = PyString_GET_SIZE(target_bytes)
234
# TODO: inline some of create_delta so we at least don't have to double
235
# malloc, and can instead use PyString_FromStringAndSize, to
236
# allocate the bytes into the final string
237
delta = create_delta(self._index,
239
&delta_size, max_delta_size)
242
result = PyString_FromStringAndSize(<char *>delta, delta_size)
247
def make_delta(source_bytes, target_bytes):
248
"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""
249
di = DeltaIndex(source_bytes)
250
return di.make_delta(target_bytes)
253
def apply_delta(source_bytes, delta_bytes):
254
"""Apply a delta generated by make_delta to source_bytes."""
256
cdef Py_ssize_t source_size
258
cdef Py_ssize_t delta_size
260
if not PyString_CheckExact(source_bytes):
261
raise TypeError('source is not a str')
262
if not PyString_CheckExact(delta_bytes):
263
raise TypeError('delta is not a str')
264
source = PyString_AS_STRING(source_bytes)
265
source_size = PyString_GET_SIZE(source_bytes)
266
delta = PyString_AS_STRING(delta_bytes)
267
delta_size = PyString_GET_SIZE(delta_bytes)
268
# Code taken from patch-delta.c, only brought here to give better error
269
# handling, and to avoid double allocating memory
270
if (delta_size < DELTA_SIZE_MIN):
271
# XXX: Invalid delta block
272
raise RuntimeError('delta_size %d smaller than min delta size %d'
273
% (delta_size, DELTA_SIZE_MIN))
275
return _apply_delta(source, source_size, delta, delta_size)
278
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
279
unsigned char cmd, unsigned int *offset, unsigned int *length): # cannot_raise
280
"""Decode a copy instruction from the next few bytes.
282
A copy instruction is a variable number of bytes, so we will parse the
283
bytes we care about, and return the new position, as well as the offset and
284
length referred to in the bytes.
286
:param bytes: Pointer to the start of bytes after cmd
287
:param cmd: The command code
288
:return: Pointer to the bytes just after the last decode byte
290
cdef unsigned int off, size, count
298
off = off | (bytes[count] << 8)
301
off = off | (bytes[count] << 16)
304
off = off | (bytes[count] << 24)
310
size = size | (bytes[count] << 8)
313
size = size | (bytes[count] << 16)
322
cdef object _apply_delta(char *source, Py_ssize_t source_size,
323
char *delta, Py_ssize_t delta_size):
324
"""common functionality between apply_delta and apply_delta_to_source."""
325
cdef unsigned char *data, *top
326
cdef unsigned char *dst_buf, *out, cmd
328
cdef unsigned int cp_off, cp_size
330
data = <unsigned char *>delta
331
top = data + delta_size
333
# now the result size
334
size = get_delta_hdr_size(&data, top)
335
result = PyString_FromStringAndSize(NULL, size)
336
dst_buf = <unsigned char*>PyString_AS_STRING(result)
344
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
345
if (cp_off + cp_size < cp_size or
346
cp_off + cp_size > source_size or
348
raise RuntimeError('Something wrong with:'
349
' cp_off = %s, cp_size = %s'
350
' source_size = %s, size = %s'
351
% (cp_off, cp_size, source_size, size))
352
memcpy(out, source + cp_off, cp_size)
354
size = size - cp_size
358
# cmd == 0 is reserved for future encoding
359
# extensions. In the mean time we must fail when
360
# encountering them (might be data corruption).
361
raise RuntimeError('Got delta opcode: 0, not supported')
363
raise RuntimeError('Insert instruction longer than remaining'
364
' bytes: %d > %d' % (cmd, size))
365
memcpy(out, data, cmd)
371
if (data != top or size != 0):
372
raise RuntimeError('Did not extract the number of bytes we expected'
373
' we were left with %d bytes in "size", and top - data = %d'
374
% (size, <int>(top - data)))
377
# *dst_size = out - dst_buf;
378
if (out - dst_buf) != PyString_GET_SIZE(result):
379
raise RuntimeError('Number of bytes extracted did not match the'
380
' size encoded in the delta header.')
384
def apply_delta_to_source(source, delta_start, delta_end):
385
"""Extract a delta from source bytes, and apply it."""
387
cdef Py_ssize_t c_source_size
389
cdef Py_ssize_t c_delta_size
390
cdef Py_ssize_t c_delta_start, c_delta_end
392
if not PyString_CheckExact(source):
393
raise TypeError('source is not a str')
394
c_source_size = PyString_GET_SIZE(source)
395
c_delta_start = delta_start
396
c_delta_end = delta_end
397
if c_delta_start >= c_source_size:
398
raise ValueError('delta starts after source')
399
if c_delta_end > c_source_size:
400
raise ValueError('delta ends after source')
401
if c_delta_start >= c_delta_end:
402
raise ValueError('delta starts after it ends')
404
c_delta_size = c_delta_end - c_delta_start
405
c_source = PyString_AS_STRING(source)
406
c_delta = c_source + c_delta_start
407
# We don't use source_size, because we know the delta should not refer to
408
# any bytes after it starts
409
return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
412
def encode_base128_int(val):
413
"""Convert an integer into a 7-bit lsb encoding."""
414
cdef unsigned int c_val
415
cdef Py_ssize_t count
416
cdef unsigned int num_bytes
417
cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes
421
while c_val >= 0x80 and count < 8:
422
c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
425
if count >= 8 or c_val >= 0x80:
426
raise ValueError('encode_base128_int overflowed the buffer')
427
c_bytes[count] = <unsigned char>(c_val & 0xFF)
429
return PyString_FromStringAndSize(<char *>c_bytes, count)
432
def decode_base128_int(bytes):
433
"""Decode an integer from a 7-bit lsb encoding."""
436
cdef unsigned int uval
438
cdef Py_ssize_t num_low_bytes
439
cdef unsigned char *c_bytes
444
if not PyString_CheckExact(bytes):
445
raise TypeError('bytes is not a string')
446
c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
447
# We take off 1, because we have to be able to decode the non-expanded byte
448
num_low_bytes = PyString_GET_SIZE(bytes) - 1
449
while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
450
val = val | ((c_bytes[offset] & 0x7F) << shift)
453
if c_bytes[offset] & 0x80:
454
raise ValueError('Data not properly formatted, we ran out of'
455
' bytes before 0x80 stopped being set.')
456
val = val | (c_bytes[offset] << shift)
459
uval = <unsigned int> val