0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
1 |
# Copyright (C) 2008 Canonical Limited.
|
2 |
#
|
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License version 2 as published
|
|
5 |
# by the Free Software Foundation.
|
|
6 |
#
|
|
7 |
# This program is distributed in the hope that it will be useful,
|
|
8 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
9 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
10 |
# GNU General Public License for more details.
|
|
11 |
#
|
|
12 |
# You should have received a copy of the GNU General Public License
|
|
13 |
# along with this program; if not, write to the Free Software
|
|
14 |
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
15 |
#
|
|
16 |
||
17 |
"""Compiled extensions for doing compression."""
|
|
18 |
||
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
19 |
cdef extern from *: |
20 |
ctypedef unsigned long size_t |
|
21 |
void * malloc(size_t) |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
22 |
void * realloc(void *, size_t) |
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
23 |
void free(void *) |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
24 |
void memcpy(void *, void *, size_t) |
25 |
||
26 |
cdef extern from "delta.h": |
|
0.23.42
by John Arbash Meinel
Change the code around again. |
27 |
struct source_info: |
28 |
void *buf |
|
29 |
unsigned long size |
|
30 |
unsigned long agg_offset |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
31 |
struct delta_index: |
32 |
unsigned long memsize |
|
33 |
void *src_buf |
|
34 |
unsigned long src_size |
|
35 |
unsigned int hash_mask |
|
36 |
# struct index_entry *hash[]
|
|
0.23.42
by John Arbash Meinel
Change the code around again. |
37 |
delta_index * create_delta_index(source_info *src) |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
38 |
void free_delta_index(delta_index *index) |
0.23.24
by John Arbash Meinel
Change the code so that we can pass in multiple sources to match against. |
39 |
void *create_delta(delta_index **indexes, |
40 |
unsigned int num_indexes, |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
41 |
void *buf, unsigned long bufsize, |
42 |
unsigned long *delta_size, unsigned long max_delta_size) |
|
43 |
unsigned long get_delta_hdr_size(unsigned char **datap, |
|
44 |
unsigned char *top) |
|
45 |
Py_ssize_t DELTA_SIZE_MIN |
|
0.23.7
by John Arbash Meinel
Add a apply_delta2 function, just in case it matters. |
46 |
void *patch_delta(void *src_buf, unsigned long src_size, |
47 |
void *delta_buf, unsigned long delta_size, |
|
48 |
unsigned long *dst_size) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
49 |
|
50 |
cdef extern from "Python.h": |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
51 |
int PyString_CheckExact(object) |
52 |
char * PyString_AS_STRING(object) |
|
53 |
Py_ssize_t PyString_GET_SIZE(object) |
|
54 |
object PyString_FromStringAndSize(char *, Py_ssize_t) |
|
55 |
||
56 |
||
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
57 |
cdef void *safe_malloc(size_t count) except NULL: |
58 |
cdef void *result |
|
59 |
result = malloc(count) |
|
60 |
if result == NULL: |
|
61 |
raise MemoryError('Failed to allocate %d bytes of memory' % (count,)) |
|
62 |
return result |
|
63 |
||
64 |
||
65 |
cdef void *safe_realloc(void * old, size_t count) except NULL: |
|
66 |
cdef void *result |
|
67 |
result = realloc(old, count) |
|
68 |
if result == NULL: |
|
69 |
raise MemoryError('Failed to reallocate to %d bytes of memory' |
|
70 |
% (count,)) |
|
71 |
return result |
|
72 |
||
73 |
||
74 |
cdef int safe_free(void **val) except -1: |
|
75 |
assert val != NULL |
|
76 |
if val[0] != NULL: |
|
77 |
free(val[0]) |
|
78 |
val[0] = NULL |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
79 |
|
0.23.17
by John Arbash Meinel
Create a wrapper function, so that lsprof will properly attribute time spent. |
80 |
def make_delta_index(source): |
81 |
return DeltaIndex(source) |
|
82 |
||
83 |
||
0.23.14
by John Arbash Meinel
Implement a DeltaIndex wrapper. |
84 |
cdef class DeltaIndex: |
85 |
||
0.23.40
by John Arbash Meinel
Add a comment why we aren't using the list type for _sources |
86 |
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
|
87 |
# isn't performance critical
|
|
88 |
# cdef readonly list _sources
|
|
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
89 |
cdef readonly object _sources |
0.23.42
by John Arbash Meinel
Change the code around again. |
90 |
cdef source_info *_source_infos |
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
91 |
cdef delta_index **_indexes |
92 |
cdef readonly unsigned int _num_indexes |
|
93 |
cdef readonly unsigned int _max_num_indexes |
|
0.23.42
by John Arbash Meinel
Change the code around again. |
94 |
cdef readonly unsigned int _max_num_sources |
0.23.32
by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta. |
95 |
cdef public unsigned long _source_offset |
0.23.14
by John Arbash Meinel
Implement a DeltaIndex wrapper. |
96 |
|
97 |
def __repr__(self): |
|
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
98 |
return '%s(%d, %d, %d)' % (self.__class__.__name__, |
99 |
len(self._sources), self._source_offset, |
|
100 |
self._num_indexes) |
|
101 |
||
102 |
def __init__(self, source=None): |
|
103 |
self._sources = [] |
|
104 |
self._max_num_indexes = 1024 |
|
105 |
self._indexes = <delta_index**>safe_malloc(sizeof(delta_index*) |
|
106 |
* self._max_num_indexes) |
|
0.23.42
by John Arbash Meinel
Change the code around again. |
107 |
self._max_num_sources = 1024 |
108 |
self._source_infos = <source_info *>safe_malloc(sizeof(source_info) |
|
109 |
* self._max_num_sources) |
|
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
110 |
self._num_indexes = 0 |
111 |
self._source_offset = 0 |
|
112 |
||
113 |
if source is not None: |
|
0.23.26
by John Arbash Meinel
We now start to make use of the ability to extend the delta index |
114 |
self.add_source(source, 0) |
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
115 |
|
116 |
def __dealloc__(self): |
|
117 |
self._ensure_no_indexes() |
|
0.23.42
by John Arbash Meinel
Change the code around again. |
118 |
safe_free(<void **>&self._source_infos) |
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
119 |
|
0.23.26
by John Arbash Meinel
We now start to make use of the ability to extend the delta index |
120 |
def add_source(self, source, unadded_bytes): |
121 |
"""Add a new bit of source text to the delta indexes.
|
|
122 |
||
123 |
:param source: The text in question, this must be a byte string
|
|
124 |
:param unadded_bytes: Assume there are this many bytes that didn't get
|
|
125 |
added between this source and the end of the previous source.
|
|
126 |
"""
|
|
0.23.14
by John Arbash Meinel
Implement a DeltaIndex wrapper. |
127 |
cdef char *c_source |
128 |
cdef Py_ssize_t c_source_size |
|
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
129 |
cdef delta_index *index |
0.23.42
by John Arbash Meinel
Change the code around again. |
130 |
cdef unsigned int source_location |
131 |
cdef source_info *src |
|
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
132 |
cdef unsigned int num_indexes |
0.23.14
by John Arbash Meinel
Implement a DeltaIndex wrapper. |
133 |
|
134 |
if not PyString_CheckExact(source): |
|
135 |
raise TypeError('source is not a str') |
|
136 |
||
0.23.42
by John Arbash Meinel
Change the code around again. |
137 |
source_location = len(self._sources) |
138 |
if source_location >= self._max_num_sources: |
|
139 |
self._expand_sources() |
|
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
140 |
self._sources.append(source) |
0.23.14
by John Arbash Meinel
Implement a DeltaIndex wrapper. |
141 |
c_source = PyString_AS_STRING(source) |
142 |
c_source_size = PyString_GET_SIZE(source) |
|
0.23.42
by John Arbash Meinel
Change the code around again. |
143 |
src = self._source_infos + source_location |
144 |
src.buf = c_source |
|
145 |
src.size = c_source_size |
|
0.23.14
by John Arbash Meinel
Implement a DeltaIndex wrapper. |
146 |
|
147 |
# TODO: Are usage is ultimately going to be different than the one that
|
|
148 |
# was originally designed. Specifically, we are going to want to
|
|
149 |
# be able to update the index by hashing future data. It should
|
|
150 |
# fit just fine into the structure. But for now, we just wrap
|
|
151 |
# create_delta_index (For example, we could always reserve enough
|
|
152 |
# space to hash a 4MB string, etc.)
|
|
0.23.42
by John Arbash Meinel
Change the code around again. |
153 |
src.agg_offset = self._source_offset + unadded_bytes |
154 |
index = create_delta_index(src) |
|
155 |
self._source_offset = src.agg_offset + src.size |
|
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
156 |
if index != NULL: |
157 |
num_indexes = self._num_indexes + 1 |
|
158 |
if num_indexes >= self._max_num_indexes: |
|
159 |
self._expand_indexes() |
|
160 |
self._indexes[self._num_indexes] = index |
|
161 |
self._num_indexes = num_indexes |
|
162 |
||
163 |
cdef _expand_indexes(self): |
|
164 |
self._max_num_indexes = self._max_num_indexes * 2 |
|
165 |
self._indexes = <delta_index **>safe_realloc(self._indexes, |
|
166 |
sizeof(delta_index *) |
|
167 |
* self._max_num_indexes) |
|
168 |
||
0.23.42
by John Arbash Meinel
Change the code around again. |
169 |
cdef _expand_sources(self): |
170 |
self._max_num_sources = self._max_num_sources * 2 |
|
171 |
self._source_infos = <source_info *>safe_realloc(self._source_infos, |
|
172 |
sizeof(source_info) |
|
173 |
* self._max_num_sources) |
|
174 |
||
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
175 |
cdef _ensure_no_indexes(self): |
176 |
cdef int i |
|
177 |
||
178 |
if self._indexes != NULL: |
|
179 |
for i from 0 <= i < self._num_indexes: |
|
180 |
free_delta_index(self._indexes[i]) |
|
181 |
self._indexes[i] = NULL |
|
182 |
free(self._indexes) |
|
183 |
self._indexes = NULL |
|
184 |
self._max_num_indexes = 0 |
|
185 |
self._num_indexes = 0 |
|
0.23.14
by John Arbash Meinel
Implement a DeltaIndex wrapper. |
186 |
|
187 |
def make_delta(self, target_bytes, max_delta_size=0): |
|
188 |
"""Create a delta from the current source to the target bytes."""
|
|
189 |
cdef char *target |
|
190 |
cdef Py_ssize_t target_size |
|
191 |
cdef void * delta |
|
192 |
cdef unsigned long delta_size |
|
193 |
||
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
194 |
if self._num_indexes == 0: |
0.23.15
by John Arbash Meinel
Handle when self._index is NULL, mostly because the source text was the empty strig. |
195 |
return None |
0.23.14
by John Arbash Meinel
Implement a DeltaIndex wrapper. |
196 |
|
197 |
if not PyString_CheckExact(target_bytes): |
|
198 |
raise TypeError('target is not a str') |
|
199 |
||
200 |
target = PyString_AS_STRING(target_bytes) |
|
201 |
target_size = PyString_GET_SIZE(target_bytes) |
|
202 |
||
203 |
# TODO: inline some of create_delta so we at least don't have to double
|
|
204 |
# malloc, and can instead use PyString_FromStringAndSize, to
|
|
205 |
# allocate the bytes into the final string
|
|
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
206 |
delta = create_delta(self._indexes, self._num_indexes, |
207 |
target, target_size, |
|
0.23.14
by John Arbash Meinel
Implement a DeltaIndex wrapper. |
208 |
&delta_size, max_delta_size) |
209 |
result = None |
|
210 |
if delta: |
|
211 |
result = PyString_FromStringAndSize(<char *>delta, delta_size) |
|
212 |
free(delta) |
|
213 |
return result |
|
214 |
||
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
215 |
|
216 |
def make_delta(source_bytes, target_bytes): |
|
0.23.42
by John Arbash Meinel
Change the code around again. |
217 |
"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""
|
218 |
di = DeltaIndex(source_bytes) |
|
219 |
return di.make_delta(target_bytes) |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
220 |
|
221 |
||
222 |
def apply_delta(source_bytes, delta_bytes): |
|
223 |
"""Apply a delta generated by make_delta to source_bytes."""
|
|
224 |
cdef char *source |
|
225 |
cdef Py_ssize_t source_size |
|
226 |
cdef char *delta |
|
227 |
cdef Py_ssize_t delta_size |
|
228 |
cdef unsigned char *data, *top |
|
229 |
cdef unsigned char *dst_buf, *out, cmd |
|
230 |
cdef Py_ssize_t size |
|
231 |
cdef unsigned long cp_off, cp_size |
|
232 |
||
233 |
if not PyString_CheckExact(source_bytes): |
|
234 |
raise TypeError('source is not a str') |
|
235 |
if not PyString_CheckExact(delta_bytes): |
|
236 |
raise TypeError('delta is not a str') |
|
237 |
||
238 |
source = PyString_AS_STRING(source_bytes) |
|
239 |
source_size = PyString_GET_SIZE(source_bytes) |
|
240 |
delta = PyString_AS_STRING(delta_bytes) |
|
241 |
delta_size = PyString_GET_SIZE(delta_bytes) |
|
242 |
||
243 |
# Code taken from patch-delta.c, only brought here to give better error
|
|
244 |
# handling, and to avoid double allocating memory
|
|
245 |
if (delta_size < DELTA_SIZE_MIN): |
|
246 |
# XXX: Invalid delta block
|
|
0.23.33
by John Arbash Meinel
Fix a bug when handling multiple large-range copies. |
247 |
raise RuntimeError('delta_size %d smaller than min delta size %d' |
248 |
% (delta_size, DELTA_SIZE_MIN)) |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
249 |
|
250 |
data = <unsigned char *>delta |
|
251 |
top = data + delta_size |
|
252 |
||
253 |
# make sure the orig file size matches what we expect
|
|
254 |
# XXX: gcc warns because data isn't defined as 'const'
|
|
255 |
size = get_delta_hdr_size(&data, top) |
|
0.23.10
by John Arbash Meinel
Allowing the source bytes to be longer than expected. |
256 |
if (size > source_size): |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
257 |
# XXX: mismatched source size
|
0.23.33
by John Arbash Meinel
Fix a bug when handling multiple large-range copies. |
258 |
raise RuntimeError('source size %d < expected source size %d' |
259 |
% (source_size, size)) |
|
0.23.10
by John Arbash Meinel
Allowing the source bytes to be longer than expected. |
260 |
source_size = size |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
261 |
|
262 |
# now the result size
|
|
263 |
size = get_delta_hdr_size(&data, top) |
|
264 |
result = PyString_FromStringAndSize(NULL, size) |
|
265 |
dst_buf = <unsigned char*>PyString_AS_STRING(result) |
|
266 |
# XXX: The original code added a trailing null here, but this shouldn't be
|
|
267 |
# necessary when using PyString_FromStringAndSize
|
|
268 |
# dst_buf[size] = 0
|
|
269 |
||
270 |
out = dst_buf |
|
271 |
while (data < top): |
|
272 |
cmd = data[0] |
|
273 |
data = data + 1 |
|
274 |
if (cmd & 0x80): |
|
275 |
cp_off = cp_size = 0 |
|
276 |
if (cmd & 0x01): |
|
277 |
cp_off = data[0] |
|
278 |
data = data + 1 |
|
279 |
if (cmd & 0x02): |
|
0.24.1
by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4 |
280 |
cp_off = cp_off | (data[0] << 8) |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
281 |
data = data + 1 |
282 |
if (cmd & 0x04): |
|
0.24.1
by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4 |
283 |
cp_off = cp_off | (data[0] << 16) |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
284 |
data = data + 1 |
285 |
if (cmd & 0x08): |
|
0.24.1
by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4 |
286 |
cp_off = cp_off | (data[0] << 24) |
287 |
data = data + 1 |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
288 |
if (cmd & 0x10): |
289 |
cp_size = data[0] |
|
290 |
data = data + 1 |
|
291 |
if (cmd & 0x20): |
|
0.24.1
by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4 |
292 |
cp_size = cp_size | (data[0] << 8) |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
293 |
data = data + 1 |
294 |
if (cmd & 0x40): |
|
0.24.1
by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4 |
295 |
cp_size = cp_size | (data[0] << 16) |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
296 |
data = data + 1 |
297 |
if (cp_size == 0): |
|
298 |
cp_size = 0x10000 |
|
299 |
if (cp_off + cp_size < cp_size or |
|
300 |
cp_off + cp_size > source_size or |
|
301 |
cp_size > size): |
|
0.23.33
by John Arbash Meinel
Fix a bug when handling multiple large-range copies. |
302 |
raise RuntimeError('Something wrong with:' |
303 |
' cp_off = %s, cp_size = %s' |
|
304 |
' source_size = %s, size = %s' |
|
305 |
% (cp_off, cp_size, source_size, size)) |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
306 |
memcpy(out, source + cp_off, cp_size) |
307 |
out = out + cp_size |
|
0.24.1
by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4 |
308 |
size = size - cp_size |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
309 |
elif (cmd): |
310 |
if (cmd > size): |
|
0.23.33
by John Arbash Meinel
Fix a bug when handling multiple large-range copies. |
311 |
raise RuntimeError('Insert instruction longer than remaining' |
312 |
' bytes: %d > %d' % (cmd, size)) |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
313 |
memcpy(out, data, cmd) |
314 |
out = out + cmd |
|
315 |
data = data + cmd |
|
0.24.1
by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4 |
316 |
size = size - cmd |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
317 |
else: |
318 |
# /*
|
|
319 |
# * cmd == 0 is reserved for future encoding
|
|
320 |
# * extensions. In the mean time we must fail when
|
|
321 |
# * encountering them (might be data corruption).
|
|
322 |
# */
|
|
323 |
## /* XXX: error("unexpected delta opcode 0"); */
|
|
0.23.33
by John Arbash Meinel
Fix a bug when handling multiple large-range copies. |
324 |
raise RuntimeError('Got delta opcode: 0, not supported') |
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
325 |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
326 |
# /* sanity check */
|
327 |
if (data != top or size != 0): |
|
328 |
## /* XXX: error("delta replay has gone wild"); */
|
|
0.23.33
by John Arbash Meinel
Fix a bug when handling multiple large-range copies. |
329 |
raise RuntimeError('Did not extract the number of bytes we expected' |
330 |
' we were left with %d bytes in "size", and top - data = %d' |
|
331 |
% (size, <int>(top - data))) |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
332 |
return None |
333 |
||
334 |
# *dst_size = out - dst_buf;
|
|
335 |
assert (out - dst_buf) == PyString_GET_SIZE(result) |
|
336 |
return result |