4763.2.4
by John Arbash Meinel
merge bzr.2.1 in preparation for NEWS entry. |
1 |
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
2 |
#
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
3 |
# This program is free software; you can redistribute it and/or modify
|
3735.36.4
by John Arbash Meinel
Fix the GPL and copyright statements in the pyrex files |
4 |
# it under the terms of the GNU General Public License as published by
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
7 |
#
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
8 |
# This program is distributed in the hope that it will be useful,
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
12 |
#
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
13 |
# You should have received a copy of the GNU General Public License
|
14 |
# along with this program; if not, write to the Free Software
|
|
3735.36.4
by John Arbash Meinel
Fix the GPL and copyright statements in the pyrex files |
15 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
16 |
|
17 |
"""Compiled extensions for doing compression."""
|
|
18 |
||
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
19 |
#python2.4 support
|
20 |
cdef extern from "python-compat.h": |
|
4265.1.3
by John Arbash Meinel
restore the old Py_ssize_t import in the pyrex files. |
21 |
pass
|
22 |
||
23 |
||
24 |
cdef extern from "Python.h": |
|
5361.2.5
by John Arbash Meinel
Pyrex doesn't allow sizeof(class), so we have to unroll it manually. |
25 |
ctypedef struct PyObject: |
26 |
pass
|
|
4265.1.1
by John Arbash Meinel
Merge the a couple rev older brisbane-core into bzr.dev, most things are resolve in favor of bzr.dev |
27 |
ctypedef int Py_ssize_t # Required for older pyrex versions |
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
28 |
int PyString_CheckExact(object) |
29 |
char * PyString_AS_STRING(object) |
|
30 |
Py_ssize_t PyString_GET_SIZE(object) |
|
31 |
object PyString_FromStringAndSize(char *, Py_ssize_t) |
|
32 |
||
33 |
||
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
34 |
cdef extern from *: |
35 |
ctypedef unsigned long size_t |
|
4788.2.2
by John Arbash Meinel
Stop holding the gil while extracting data. |
36 |
void * malloc(size_t) nogil |
37 |
void * realloc(void *, size_t) nogil |
|
38 |
void free(void *) nogil |
|
39 |
void memcpy(void *, void *, size_t) nogil |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
40 |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
41 |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
42 |
cdef extern from "delta.h": |
0.23.42
by John Arbash Meinel
Change the code around again. |
43 |
struct source_info: |
44 |
void *buf |
|
45 |
unsigned long size |
|
46 |
unsigned long agg_offset |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
47 |
struct delta_index: |
0.23.45
by John Arbash Meinel
Add a function that updates the index for delta bytes. |
48 |
pass
|
5698.2.10
by Martin
Add ctypedef marker which Cython may need though Pyrex does not |
49 |
ctypedef enum delta_result: |
5698.2.5
by Martin
Switch approach to delta function interfaces and use a return code and outparam |
50 |
DELTA_OK
|
51 |
DELTA_OUT_OF_MEMORY
|
|
52 |
DELTA_INDEX_NEEDED
|
|
53 |
DELTA_SOURCE_EMPTY
|
|
54 |
DELTA_SOURCE_BAD
|
|
5698.2.6
by Martin
Also adapt create_delta to the return code interface as it uses malloc |
55 |
DELTA_BUFFER_EMPTY
|
56 |
DELTA_SIZE_TOO_BIG
|
|
5698.2.5
by Martin
Switch approach to delta function interfaces and use a return code and outparam |
57 |
delta_result create_delta_index(source_info *src, |
58 |
delta_index *old, |
|
5755.2.3
by John Arbash Meinel
Add a max_entries_per_source to DeltaIndex |
59 |
delta_index **fresh, |
5755.2.10
by John Arbash Meinel
Merge Martin gz's tweaks for signed vs unsigned, but tweak them a bit further. |
60 |
int max_entries) nogil |
5698.2.5
by Martin
Switch approach to delta function interfaces and use a return code and outparam |
61 |
delta_result create_delta_index_from_delta(source_info *delta, |
62 |
delta_index *old, |
|
63 |
delta_index **fresh) nogil |
|
4788.2.1
by John Arbash Meinel
Wrap the core groupcompress.create_delta calls with 'with nogil' statements. |
64 |
void free_delta_index(delta_index *index) nogil |
5698.2.6
by Martin
Also adapt create_delta to the return code interface as it uses malloc |
65 |
delta_result create_delta(delta_index *indexes, |
66 |
void *buf, unsigned long bufsize, |
|
67 |
unsigned long *delta_size, |
|
68 |
unsigned long max_delta_size, |
|
69 |
void **delta_data) nogil |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
70 |
unsigned long get_delta_hdr_size(unsigned char **datap, |
4788.2.1
by John Arbash Meinel
Wrap the core groupcompress.create_delta calls with 'with nogil' statements. |
71 |
unsigned char *top) nogil |
5361.2.3
by John Arbash Meinel
Add a __sizeof__ member for DeltaIndex. |
72 |
unsigned long sizeof_delta_index(delta_index *index) |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
73 |
Py_ssize_t DELTA_SIZE_MIN |
5755.2.3
by John Arbash Meinel
Add a max_entries_per_source to DeltaIndex |
74 |
int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset) |
75 |
int get_entry_summary(delta_index *index, int pos, |
|
76 |
unsigned int *global_offset, unsigned int *hash_val) |
|
5755.2.10
by John Arbash Meinel
Merge Martin gz's tweaks for signed vs unsigned, but tweak them a bit further. |
77 |
unsigned int rabin_hash (unsigned char *data) |
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
78 |
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
79 |
|
0.23.25
by John Arbash Meinel
We are now able to add multiple sources to the delta generator. |
80 |
cdef void *safe_malloc(size_t count) except NULL:
    """Allocate ``count`` bytes with malloc(), raising instead of returning NULL.

    :param count: Number of bytes to request from malloc().
    :return: The pointer malloc() handed back; never NULL.
    :raises MemoryError: If malloc() returns NULL.
    """
    cdef void *block
    block = malloc(count)
    if block != NULL:
        return block
    # A NULL return is turned into a Python exception; the 'except NULL'
    # clause in the signature is what lets it propagate to the caller.
    raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
|
86 |
||
87 |
||
88 |
cdef void *safe_realloc(void * old, size_t count) except NULL:
    """Resize ``old`` to ``count`` bytes with realloc(), raising on failure.

    :param old: Pointer handed to realloc() as the block to resize.
    :param count: Requested new size in bytes.
    :return: The pointer realloc() handed back; never NULL.
    :raises MemoryError: If realloc() returns NULL.
    """
    cdef void *block
    block = realloc(old, count)
    if block != NULL:
        return block
    raise MemoryError('Failed to reallocate to %d bytes of memory'
                      % (count,))
|
95 |
||
96 |
||
97 |
cdef int safe_free(void **val) except -1:
    """Free the block that ``val`` points at and reset that slot to NULL.

    A slot that already holds NULL is left alone, so calling this twice on
    the same slot is harmless.

    :param val: Address of the pointer to release; must not itself be NULL.
    :return: 0; -1 is reserved by the signature for a raised exception.
    """
    assert val != NULL
    if val[0] == NULL:
        # Nothing allocated (or already released).
        return 0
    free(val[0])
    val[0] = NULL
    return 0
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
102 |
|
0.23.17
by John Arbash Meinel
Create a wrapper function, so that lsprof will properly attribute time spent. |
103 |
def make_delta_index(source):
    """Build a DeltaIndex for ``source``.

    :param source: Passed unchanged as the ``source`` argument of DeltaIndex.
    :return: The newly constructed DeltaIndex.
    """
    new_index = DeltaIndex(source)
    return new_index
|
105 |
||
106 |
||
5698.2.5
by Martin
Switch approach to delta function interfaces and use a return code and outparam |
107 |
cdef object _translate_delta_failure(delta_result result):
    """Build the exception that corresponds to a delta_result failure code.

    The exception is returned, not raised; callers do
    ``raise _translate_delta_failure(res)``.

    :param result: A delta_result code other than DELTA_OK.
    :return: An exception instance.  Codes without a dedicated message
        (DELTA_SIZE_TOO_BIG included) produce an AssertionError.
    """
    if result == DELTA_OUT_OF_MEMORY:
        exc_class = MemoryError
        message = "Delta function failed to allocate memory"
    elif result == DELTA_INDEX_NEEDED:
        exc_class = ValueError
        message = "Delta function requires delta_index param"
    elif result == DELTA_SOURCE_EMPTY:
        exc_class = ValueError
        message = "Delta function given empty source_info param"
    elif result == DELTA_SOURCE_BAD:
        exc_class = RuntimeError
        message = "Delta function given invalid source_info param"
    elif result == DELTA_BUFFER_EMPTY:
        exc_class = ValueError
        message = "Delta function given empty buffer params"
    else:
        exc_class = AssertionError
        message = "Unrecognised delta result code: %d" % result
    return exc_class(message)
119 |
||
120 |
||
5755.2.8
by John Arbash Meinel
Do a lot of renaming. |
121 |
def _rabin_hash(content):
    """Return rabin_hash() of the start of ``content`` as a Python int.

    :param content: An exact str instance, at least 16 bytes long.
    :return: The C rabin_hash() result converted with int().
    :raises ValueError: If content is not an exact str, or is shorter than
        16 bytes.
    """
    cdef unsigned char *c_content

    if not PyString_CheckExact(content):
        raise ValueError('content must be a string')
    if len(content) < 16:
        # rabin_hash() only receives a pointer, so the length has to be
        # checked here.  NOTE(review): 16 presumably matches the hash window
        # size used in delta.h -- confirm.
        raise ValueError('content must be at least 16 bytes long')
    c_content = <unsigned char *>PyString_AS_STRING(content)
    # Try to cast it to an int, if it can fit
    return int(rabin_hash(c_content))
5755.2.3
by John Arbash Meinel
Add a max_entries_per_source to DeltaIndex |
128 |
|
129 |
||
0.23.14
by John Arbash Meinel
Implement a DeltaIndex wrapper. |
130 |
cdef class DeltaIndex:
    """Python-level wrapper around the C ``delta_index`` declared in delta.h.

    Source texts are registered with add_source() / add_delta_source() and
    deltas against them are produced with make_delta().

    Constructor arguments:

    :param source: Optional first source text; when given it is registered
        with ``add_source(source, 0)``.
    :param max_bytes_to_index: Optional int stored in _max_bytes_to_index and
        handed to create_delta_index() as its ``max_entries`` argument.
        None (the default) is stored as 0.

    Indexing of the first source is lazy: the C index is only built when a
    second source is added or when make_delta() is first called.
    """

    # We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
    # isn't performance critical
    # cdef readonly list _sources
    # Python strings whose buffers are referenced from _source_infos; keeping
    # them in this list keeps those buffers alive.
    cdef readonly object _sources
    # malloc'd array of _max_num_sources entries, one per registered source.
    cdef source_info *_source_infos
    # The C index, or NULL while no index has been built yet.
    cdef delta_index *_index
    # Aggregate offset just past the end of the last registered source.
    cdef public unsigned long _source_offset
    cdef readonly unsigned int _max_num_sources
    cdef public int _max_bytes_to_index

    def __init__(self, source=None, max_bytes_to_index=None):
        # __init__ can be invoked more than once on the same object.  Release
        # whatever an earlier call allocated so it is not leaked when the
        # fields are overwritten below.  On the first call both fields are
        # still NULL and this is a no-op.
        if self._index != NULL:
            free_delta_index(self._index)
            self._index = NULL
        safe_free(<void **>&self._source_infos)

        self._sources = []
        self._index = NULL
        self._max_num_sources = 65000
        self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
                                                        * self._max_num_sources)
        self._source_offset = 0
        self._max_bytes_to_index = 0
        if max_bytes_to_index is not None:
            self._max_bytes_to_index = max_bytes_to_index

        if source is not None:
            self.add_source(source, 0)

    def __sizeof__(self):
        # We want to track the _source_infos allocations, but the referenced
        # void* are actually tracked in _sources itself.
        # XXX: Cython is capable of doing sizeof(class) and returning the size
        #      of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
        #      to do it manually. *sigh* Note that we might get it wrong
        #      because of alignment issues.
        cdef Py_ssize_t size
        # PyObject start, vtable *, 3 pointers (_sources, _source_infos,
        # _index), 1 unsigned long (_source_offset), 1 unsigned int
        # (_max_num_sources) and 1 int (_max_bytes_to_index)
        size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
                 + sizeof(unsigned long)
                 + sizeof(unsigned int)
                 + sizeof(int))
                + (sizeof(source_info) * self._max_num_sources)
                + sizeof_delta_index(self._index))
        return size

    def __repr__(self):
        return '%s(%d, %d)' % (self.__class__.__name__,
                               len(self._sources), self._source_offset)

    def __dealloc__(self):
        if self._index != NULL:
            free_delta_index(self._index)
            self._index = NULL
        safe_free(<void **>&self._source_infos)

    def _has_index(self):
        """Return True once the C index has actually been built."""
        return (self._index != NULL)

    def _dump_index(self):
        """Dump the pointers in the index.

        This is an arbitrary layout, used for testing. It is not meant to be
        used in production code.

        :return: (hash_list, entry_list), or None when no index has been
            built yet.
            hash_list   A list of offsets, so hash[i] points to the 'hash
                        bucket' starting at the given offset and going until
                        hash[i+1]
            entry_list  A list of (text_offset, hash_val). text_offset is the
                        offset in the "source" texts, and hash_val is the RABIN
                        hash for that offset.
                        Note that the entry should be in the hash bucket
                        defined by
                        hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
        """
        cdef int pos
        cdef unsigned int text_offset
        cdef unsigned int hash_val
        cdef unsigned int hash_offset
        if self._index == NULL:
            return None
        hash_list = []
        pos = 0
        # Both C accessors return a false value once pos runs off the end.
        while get_hash_offset(self._index, pos, &hash_offset):
            hash_list.append(int(hash_offset))
            pos += 1
        entry_list = []
        pos = 0
        while get_entry_summary(self._index, pos, &text_offset, &hash_val):
            # Map back using 'int' so that we don't get Long everywhere, when
            # almost everything is <2**31.
            entry_list.append((int(text_offset), int(hash_val)))
            pos += 1
        return hash_list, entry_list

    def add_delta_source(self, delta, unadded_bytes):
        """Add a new delta to the source texts.

        :param delta: The text of the delta, this must be a byte string.
        :param unadded_bytes: Number of bytes that were added to the source
            that were not indexed.
        :raises TypeError: If delta is not an exact str.
        :raises RuntimeError: If the fixed-size source table is full (see
            _expand_sources).
        :raises Exception: Whatever _translate_delta_failure() builds when
            create_delta_index_from_delta() does not return DELTA_OK.
        """
        cdef char *c_delta
        cdef Py_ssize_t c_delta_size
        cdef delta_index *index
        cdef delta_result res
        cdef unsigned int source_location
        cdef source_info *src

        if not PyString_CheckExact(delta):
            raise TypeError('delta is not a str')

        source_location = len(self._sources)
        if source_location >= self._max_num_sources:
            self._expand_sources()
        if source_location != 0 and self._index == NULL:
            # We were lazy about populating the index, create it now so the
            # first source is indexed before the delta is merged in (same
            # rule as add_source).
            self._populate_first_index()
        # Keep a reference to the string: src.buf below points into it.
        self._sources.append(delta)
        c_delta = PyString_AS_STRING(delta)
        c_delta_size = PyString_GET_SIZE(delta)
        src = self._source_infos + source_location
        src.buf = c_delta
        src.size = c_delta_size
        src.agg_offset = self._source_offset + unadded_bytes
        with nogil:
            res = create_delta_index_from_delta(src, self._index, &index)
        if res != DELTA_OK:
            raise _translate_delta_failure(res)
        self._source_offset = src.agg_offset + src.size
        if index != self._index:
            # The C layer handed back a different index; drop the old one.
            free_delta_index(self._index)
            self._index = index

    def add_source(self, source, unadded_bytes):
        """Add a new bit of source text to the delta indexes.

        The first source is only recorded; its index is built lazily (see
        _populate_first_index).  Later sources are indexed immediately, with
        self._max_bytes_to_index passed to create_delta_index() as the
        ``max_entries`` limit.

        :param source: The text in question, this must be a byte string
        :param unadded_bytes: Assume there are this many bytes that didn't get
            added between this source and the end of the previous source.
        :raises TypeError: If source is not an exact str.
        :raises RuntimeError: If the fixed-size source table is full (see
            _expand_sources).
        :raises Exception: Whatever _translate_delta_failure() builds when
            create_delta_index() does not return DELTA_OK.
        """
        cdef char *c_source
        cdef Py_ssize_t c_source_size
        cdef delta_index *index
        cdef delta_result res
        cdef unsigned int source_location
        cdef source_info *src

        if not PyString_CheckExact(source):
            raise TypeError('source is not a str')

        source_location = len(self._sources)
        if source_location >= self._max_num_sources:
            self._expand_sources()
        if source_location != 0 and self._index == NULL:
            # We were lazy about populating the index, create it now
            self._populate_first_index()
        # Keep a reference to the string: src.buf below points into it.
        self._sources.append(source)
        c_source = PyString_AS_STRING(source)
        c_source_size = PyString_GET_SIZE(source)
        src = self._source_infos + source_location
        src.buf = c_source
        src.size = c_source_size

        src.agg_offset = self._source_offset + unadded_bytes
        self._source_offset = src.agg_offset + src.size
        # We delay creating the index on the first insert
        if source_location != 0:
            with nogil:
                res = create_delta_index(src, self._index, &index,
                                         self._max_bytes_to_index)
            if res != DELTA_OK:
                raise _translate_delta_failure(res)
            if index != self._index:
                free_delta_index(self._index)
                self._index = index

    cdef _populate_first_index(self):
        """Build the index for the single, lazily recorded first source.

        :raises AssertionError: If there is not exactly one source, or an
            index already exists.
        :raises Exception: Whatever _translate_delta_failure() builds when
            create_delta_index() does not return DELTA_OK.
        """
        cdef delta_index *index
        cdef delta_result res
        if len(self._sources) != 1 or self._index != NULL:
            raise AssertionError('_populate_first_index should only be'
                ' called when we have a single source and no index yet')

        # We know that self._index is already NULL, so create_delta_index
        # will always create a new index unless there's a malloc failure
        with nogil:
            res = create_delta_index(&self._source_infos[0], NULL, &index,
                                     self._max_bytes_to_index)
        if res != DELTA_OK:
            raise _translate_delta_failure(res)
        self._index = index

    cdef _expand_sources(self):
        """Growing the source table is not supported; always raises.

        The statements after the raise are unreachable.  They record how the
        table would be grown once the index pointers can be fixed up after
        the array moves.
        """
        raise RuntimeError('if we move self._source_infos, then we need to'
                           ' change all of the index pointers as well.')
        self._max_num_sources = self._max_num_sources * 2
        self._source_infos = <source_info *>safe_realloc(self._source_infos,
                                                sizeof(source_info)
                                                * self._max_num_sources)

    def make_delta(self, target_bytes, max_delta_size=0):
        """Create a delta from the current source to the target bytes.

        :param target_bytes: The text to encode; must be an exact str.
        :param max_delta_size: Passed to create_delta() as its size limit.
        :return: The delta as a str.  None is returned when no source has
            been added yet, or when create_delta() reports
            DELTA_SIZE_TOO_BIG.
        :raises TypeError: If target_bytes is not an exact str (only checked
            once at least one source exists).
        :raises Exception: Whatever _translate_delta_failure() builds for any
            other non-OK result.
        """
        cdef char *target
        cdef Py_ssize_t target_size
        cdef void * delta
        cdef unsigned long delta_size
        cdef unsigned long c_max_delta_size
        cdef delta_result res

        if self._index == NULL:
            if len(self._sources) == 0:
                return None
            # We were just lazy about generating the index
            self._populate_first_index()

        if not PyString_CheckExact(target_bytes):
            raise TypeError('target is not a str')

        target = PyString_AS_STRING(target_bytes)
        target_size = PyString_GET_SIZE(target_bytes)

        # TODO: inline some of create_delta so we at least don't have to double
        #       malloc, and can instead use PyString_FromStringAndSize, to
        #       allocate the bytes into the final string
        c_max_delta_size = max_delta_size
        with nogil:
            res = create_delta(self._index, target, target_size,
                               &delta_size, c_max_delta_size, &delta)
        result = None
        if res == DELTA_OK:
            # The C buffer is ours to release; do so even if building the
            # Python string fails, otherwise it would leak.
            try:
                result = PyString_FromStringAndSize(<char *>delta, delta_size)
            finally:
                free(delta)
        elif res != DELTA_SIZE_TOO_BIG:
            raise _translate_delta_failure(res)
        return result
368 |
||
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
369 |
|
370 |
def make_delta(source_bytes, target_bytes):
    """Create a delta, this is a wrapper around DeltaIndex.make_delta.

    :param source_bytes: Used as the only source of a temporary DeltaIndex.
    :param target_bytes: The text to encode against source_bytes.
    :return: Whatever DeltaIndex.make_delta(target_bytes) returns.
    """
    return DeltaIndex(source_bytes).make_delta(target_bytes)
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
374 |
|
375 |
||
376 |
def apply_delta(source_bytes, delta_bytes):
    """Apply a delta generated by make_delta to source_bytes.

    :param source_bytes: The source text; must be an exact str.
    :param delta_bytes: The delta to apply; must be an exact str.
    :return: Whatever _apply_delta() produces for the two buffers.
    :raises TypeError: If either argument is not an exact str.
    :raises RuntimeError: If the delta is shorter than DELTA_SIZE_MIN.
    """
    cdef char *c_source
    cdef char *c_delta
    cdef Py_ssize_t c_source_len
    cdef Py_ssize_t c_delta_len

    if not PyString_CheckExact(source_bytes):
        raise TypeError('source is not a str')
    if not PyString_CheckExact(delta_bytes):
        raise TypeError('delta is not a str')

    c_delta = PyString_AS_STRING(delta_bytes)
    c_delta_len = PyString_GET_SIZE(delta_bytes)
    # Code taken from patch-delta.c, only brought here to give better error
    # handling, and to avoid double allocating memory
    if c_delta_len < DELTA_SIZE_MIN:
        # XXX: Invalid delta block
        raise RuntimeError('delta_size %d smaller than min delta size %d'
                           % (c_delta_len, DELTA_SIZE_MIN))

    c_source = PyString_AS_STRING(source_bytes)
    c_source_len = PyString_GET_SIZE(source_bytes)
    return _apply_delta(c_source, c_source_len, c_delta, c_delta_len)
399 |
||
400 |
||
3735.40.20
by John Arbash Meinel
cleanup the apply_delta code a bit. |
401 |
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
    unsigned char cmd, unsigned int *offset,
    unsigned int *length) nogil: # cannot_raise
    """Decode a copy instruction from the next few bytes.

    A copy instruction is a variable number of bytes, so we will parse the
    bytes we care about, and return the new position, as well as the offset and
    length referred to in the bytes.

    The low seven bits of cmd say which bytes are present: bits 0x01-0x08
    select the four offset bytes and bits 0x10-0x40 the three length bytes,
    least significant byte first.  A byte whose bit is clear is not in the
    stream and contributes zero.

    :param bytes: Pointer to the start of bytes after cmd
    :param cmd: The command code
    :param offset: Out-parameter; receives the decoded copy offset.
    :param length: Out-parameter; receives the decoded copy length.  A decoded
        length of 0 is reported as 0x10000.
    :return: Pointer to the bytes just after the last decode byte
    """
    cdef unsigned int off, size, count
    off = 0
    size = 0
    count = 0
    # Each byte is widened to unsigned int *before* shifting.  Otherwise C
    # promotes the unsigned char to a signed int, and shifting a byte >= 0x80
    # left by 24 overflows it, which is undefined behaviour.
    if (cmd & 0x01):
        off = bytes[count]
        count = count + 1
    if (cmd & 0x02):
        off = off | ((<unsigned int>bytes[count]) << 8)
        count = count + 1
    if (cmd & 0x04):
        off = off | ((<unsigned int>bytes[count]) << 16)
        count = count + 1
    if (cmd & 0x08):
        off = off | ((<unsigned int>bytes[count]) << 24)
        count = count + 1
    if (cmd & 0x10):
        size = bytes[count]
        count = count + 1
    if (cmd & 0x20):
        size = size | ((<unsigned int>bytes[count]) << 8)
        count = count + 1
    if (cmd & 0x40):
        size = size | ((<unsigned int>bytes[count]) << 16)
        count = count + 1
    if (size == 0):
        size = 0x10000
    offset[0] = off
    length[0] = size
    return bytes + count
|
444 |
||
445 |
||
3735.40.19
by John Arbash Meinel
Implement apply_delta_to_source which doesn't have to malloc another string. |
446 |
cdef object _apply_delta(char *source, Py_ssize_t source_size,
                         char *delta, Py_ssize_t delta_size):
    """common functionality between apply_delta and apply_delta_to_source.

    The delta starts with a header from which get_delta_hdr_size() reads the
    size of the result, followed by a stream of instructions. An instruction
    byte with the high bit set is a copy from 'source'; otherwise the byte is
    the number of literal bytes that follow and are inserted verbatim.

    :param source: Pointer to the bytes that copy instructions refer to
    :param source_size: Number of bytes of source that copies may touch
    :param delta: Pointer to the start of the delta (header included)
    :param delta_size: Number of bytes in the delta
    :return: A new string holding the expanded content
    :raises ValueError: if the delta is truncated, contains the reserved
        opcode 0, copies from outside of source, or produces more bytes than
        the header announced
    :raises RuntimeError: if, after processing every instruction, the
        delta was not consumed exactly or the output size does not match
        the header
    """
    cdef unsigned char *data, *top
    cdef unsigned char *dst_buf, *out, cmd
    cdef Py_ssize_t size
    cdef Py_ssize_t available
    cdef unsigned int cp_off, cp_size
    cdef unsigned int bits, needed
    cdef int failed

    data = <unsigned char *>delta
    top = data + delta_size

    # now the result size
    size = get_delta_hdr_size(&data, top)
    result = PyString_FromStringAndSize(NULL, size)
    dst_buf = <unsigned char*>PyString_AS_STRING(result)

    failed = 0
    needed = 0
    available = 0
    # Errors are only recorded in 'failed' inside the loop, because we
    # cannot raise an exception while the GIL is released.
    with nogil:
        out = dst_buf
        while (data < top):
            cmd = data[0]
            data = data + 1
            # Number of delta bytes left after the command byte
            available = top - data
            if (cmd & 0x80):
                # Copy instruction
                # One argument byte follows for each set bit in the low 7
                # bits. Make sure they are all inside the delta before
                # decoding, so a truncated delta cannot make us read past
                # 'top'.
                needed = 0
                bits = cmd & 0x7F
                while bits:
                    needed = needed + (bits & 1)
                    bits = bits >> 1
                if <Py_ssize_t>needed > available:
                    failed = 4
                    break
                data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
                # The first test catches cp_off + cp_size wrapping around
                if (cp_off + cp_size < cp_size or
                    cp_off + cp_size > <unsigned int>source_size or
                    cp_size > <unsigned int>size):
                    failed = 1
                    break
                memcpy(out, source + cp_off, cp_size)
                out = out + cp_size
                size = size - cp_size
            else:
                # Insert instruction
                if cmd == 0:
                    # cmd == 0 is reserved for future encoding
                    # extensions. In the mean time we must fail when
                    # encountering them (might be data corruption).
                    failed = 2
                    break
                if cmd > size:
                    failed = 3
                    break
                if cmd > available:
                    # The literal bytes would run past the end of the delta
                    needed = cmd
                    failed = 4
                    break
                memcpy(out, data, cmd)
                out = out + cmd
                data = data + cmd
                size = size - cmd
    if failed:
        if failed == 1:
            raise ValueError('Something wrong with:'
                ' cp_off = %s, cp_size = %s'
                ' source_size = %s, size = %s'
                % (cp_off, cp_size, source_size, size))
        elif failed == 2:
            raise ValueError('Got delta opcode: 0, not supported')
        elif failed == 3:
            raise ValueError('Insert instruction longer than remaining'
                ' bytes: %d > %d' % (cmd, size))
        elif failed == 4:
            raise ValueError('Delta is truncated: instruction needs %d more'
                ' bytes, but only %d remain' % (needed, available))

    # sanity check
    if (data != top or size != 0):
        raise RuntimeError('Did not extract the number of bytes we expected'
            ' we were left with %d bytes in "size", and top - data = %d'
            % (size, <int>(top - data)))

    # *dst_size = out - dst_buf;
    if (out - dst_buf) != PyString_GET_SIZE(result):
        raise RuntimeError('Number of bytes extracted did not match the'
            ' size encoded in the delta header.')
    return result
3735.40.16
by John Arbash Meinel
Implement (de|en)code_base128_int in pyrex. |
519 |
|
520 |
||
3735.40.19
by John Arbash Meinel
Implement apply_delta_to_source which doesn't have to malloc another string. |
521 |
def apply_delta_to_source(source, delta_start, delta_end):
    """Extract a delta from source bytes, and apply it.

    The delta is the slice source[delta_start:delta_end]. Copy instructions
    in it may only refer to the bytes of source that come before
    delta_start.

    :param source: A str holding both the referenced content and the delta
    :param delta_start: Offset in source of the first byte of the delta
    :param delta_end: Offset in source just past the last byte of the delta
    :return: The result of _apply_delta() for that delta
    :raises TypeError: if source is not exactly a str
    :raises ValueError: if the (delta_start, delta_end) range is empty,
        reversed, or not fully inside of source
    """
    cdef char *c_source
    cdef Py_ssize_t c_source_size
    cdef char *c_delta
    cdef Py_ssize_t c_delta_size
    cdef Py_ssize_t c_delta_start, c_delta_end

    if not PyString_CheckExact(source):
        raise TypeError('source is not a str')
    c_source_size = PyString_GET_SIZE(source)
    c_delta_start = delta_start
    c_delta_end = delta_end
    if c_delta_start < 0:
        # A negative start would make c_delta point in front of the string
        # buffer, it is not interpreted as an index relative to the end.
        raise ValueError('delta starts before source')
    if c_delta_start >= c_source_size:
        raise ValueError('delta starts after source')
    if c_delta_end > c_source_size:
        raise ValueError('delta ends after source')
    if c_delta_start >= c_delta_end:
        raise ValueError('delta starts after it ends')

    c_delta_size = c_delta_end - c_delta_start
    c_source = PyString_AS_STRING(source)
    c_delta = c_source + c_delta_start
    # We don't use source_size, because we know the delta should not refer to
    # any bytes after it starts
    return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
|
547 |
||
548 |
||
3735.40.16
by John Arbash Meinel
Implement (de|en)code_base128_int in pyrex. |
549 |
def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding.

    The value is emitted 7 bits at a time, least significant group first.
    Every byte except the last one has the 0x80 bit set, to signal that
    more bytes follow.

    :param val: The integer to encode, it is converted to a C unsigned int
    :return: A str with the encoded bytes
    :raises ValueError: if the encoding would not fit in the local buffer
    """
    cdef unsigned int remaining
    cdef Py_ssize_t n_out
    cdef unsigned char buf[8] # max size for 32-bit int is 5 bytes

    remaining = val
    n_out = 0
    while n_out < 8 and remaining >= 0x80:
        # low 7 bits of the value, plus the continuation marker
        buf[n_out] = <unsigned char>((remaining & 0x7F) | 0x80)
        remaining = remaining >> 7
        n_out = n_out + 1
    if n_out >= 8 or remaining >= 0x80:
        raise ValueError('encode_base128_int overflowed the buffer')
    # The final byte is written without the continuation marker
    buf[n_out] = <unsigned char>(remaining & 0xFF)
    n_out = n_out + 1
    return PyString_FromStringAndSize(<char *>buf, n_out)
|
567 |
||
568 |
||
569 |
def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding.

    Bytes are consumed from the start of the string. Each byte contributes
    its low 7 bits, least significant group first, and decoding stops at the
    first byte that does not have the 0x80 bit set. Anything after that byte
    is ignored.

    :param bytes: A str starting with the encoded integer
    :return: (value, offset) where offset is the number of bytes consumed
    :raises TypeError: if bytes is not exactly a str
    :raises ValueError: if bytes is empty, if it ends before a byte without
        the 0x80 bit was found, or if the integer is encoded in more than 5
        bytes
    """
    cdef int offset
    cdef int val
    cdef unsigned int uval
    cdef int shift
    cdef Py_ssize_t num_low_bytes
    cdef unsigned char *c_bytes

    offset = 0
    uval = 0
    shift = 0
    if not PyString_CheckExact(bytes):
        raise TypeError('bytes is not a string')
    c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
    # We take off 1, because we have to be able to decode the non-expanded byte
    num_low_bytes = PyString_GET_SIZE(bytes) - 1
    if num_low_bytes < 0:
        # Without this we would decode the string's NUL terminator and
        # claim to have consumed one byte of an empty string.
        raise ValueError('Data not properly formatted, cannot decode an'
                         ' empty string.')
    while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
        if shift >= 28:
            # The byte after this one would be shifted by 35 bits, which is
            # undefined for a 32-bit integer.
            raise ValueError('Data not properly formatted, too many bytes'
                             ' for a 32-bit integer.')
        # Accumulate as unsigned so that we never shift into a sign bit
        uval = uval | ((<unsigned int>(c_bytes[offset] & 0x7F)) << shift)
        shift = shift + 7
        offset = offset + 1
    if c_bytes[offset] & 0x80:
        raise ValueError('Data not properly formatted, we ran out of'
                         ' bytes before 0x80 stopped being set.')
    uval = uval | ((<unsigned int>c_bytes[offset]) << shift)
    offset = offset + 1
    if uval > 0x7FFFFFFF:
        # Does not fit in a signed int, hand it back as unsigned
        return uval, offset
    val = <int>uval
    return val, offset
|
599 |
||
600 |