~bzr-pqm/bzr/bzr.dev

4763.2.4 by John Arbash Meinel
merge bzr.2.1 in preparation for NEWS entry.
1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
2
#
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
3
# This program is free software; you can redistribute it and/or modify
3735.36.4 by John Arbash Meinel
Fix the GPL and copyright statements in the pyrex files
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
7
#
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
12
#
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
3735.36.4 by John Arbash Meinel
Fix the GPL and copyright statements in the pyrex files
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
16
17
"""Compiled extensions for doing compression."""
18
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
19
#python2.4 support
20
cdef extern from "python-compat.h":
4265.1.3 by John Arbash Meinel
restore the old Py_ssize_t import in the pyrex files.
21
    pass
22
23
24
cdef extern from "Python.h":
5361.2.5 by John Arbash Meinel
Pyrex doesn't allow sizeof(class), so we have to unroll it manually.
25
    ctypedef struct PyObject:
26
        pass
4265.1.1 by John Arbash Meinel
Merge the a couple rev older brisbane-core into bzr.dev, most things are resolve in favor of bzr.dev
27
    ctypedef int Py_ssize_t # Required for older pyrex versions
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
28
    int PyString_CheckExact(object)
29
    char * PyString_AS_STRING(object)
30
    Py_ssize_t PyString_GET_SIZE(object)
31
    object PyString_FromStringAndSize(char *, Py_ssize_t)
32
33
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
34
cdef extern from *:
35
    ctypedef unsigned long size_t
4788.2.2 by John Arbash Meinel
Stop holding the gil while extracting data.
36
    void * malloc(size_t) nogil
37
    void * realloc(void *, size_t) nogil
38
    void free(void *) nogil
39
    void memcpy(void *, void *, size_t) nogil
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
40
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
41
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
42
cdef extern from "delta.h":
0.23.42 by John Arbash Meinel
Change the code around again.
43
    struct source_info:
44
        void *buf
45
        unsigned long size
46
        unsigned long agg_offset
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
47
    struct delta_index:
0.23.45 by John Arbash Meinel
Add a function that updates the index for delta bytes.
48
        pass
5698.2.10 by Martin
Add ctypedef marker which Cython may need though Pyrex does not
49
    ctypedef enum delta_result:
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
50
        DELTA_OK
51
        DELTA_OUT_OF_MEMORY
52
        DELTA_INDEX_NEEDED
53
        DELTA_SOURCE_EMPTY
54
        DELTA_SOURCE_BAD
5698.2.6 by Martin
Also adapt create_delta to the return code interface as it uses malloc
55
        DELTA_BUFFER_EMPTY
56
        DELTA_SIZE_TOO_BIG
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
57
    delta_result create_delta_index(source_info *src,
58
                                    delta_index *old,
5755.2.3 by John Arbash Meinel
Add a max_entries_per_source to DeltaIndex
59
                                    delta_index **fresh,
5755.2.10 by John Arbash Meinel
Merge Martin gz's tweaks for signed vs unsigned, but tweak them a bit further.
60
                                    int max_entries) nogil
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
61
    delta_result create_delta_index_from_delta(source_info *delta,
62
                                               delta_index *old,
63
                                               delta_index **fresh) nogil
4788.2.1 by John Arbash Meinel
Wrap the core groupcompress.create_delta calls with 'with nogil' statements.
64
    void free_delta_index(delta_index *index) nogil
5698.2.6 by Martin
Also adapt create_delta to the return code interface as it uses malloc
65
    delta_result create_delta(delta_index *indexes,
66
                              void *buf, unsigned long bufsize,
67
                              unsigned long *delta_size,
68
                              unsigned long max_delta_size,
69
                              void **delta_data) nogil
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
70
    unsigned long get_delta_hdr_size(unsigned char **datap,
4788.2.1 by John Arbash Meinel
Wrap the core groupcompress.create_delta calls with 'with nogil' statements.
71
                                     unsigned char *top) nogil
5361.2.3 by John Arbash Meinel
Add a __sizeof__ member for DeltaIndex.
72
    unsigned long sizeof_delta_index(delta_index *index)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
73
    Py_ssize_t DELTA_SIZE_MIN
5755.2.3 by John Arbash Meinel
Add a max_entries_per_source to DeltaIndex
74
    int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset)
75
    int get_entry_summary(delta_index *index, int pos,
76
                          unsigned int *global_offset, unsigned int *hash_val)
5755.2.10 by John Arbash Meinel
Merge Martin gz's tweaks for signed vs unsigned, but tweak them a bit further.
77
    unsigned int rabin_hash (unsigned char *data)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
78
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
79
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
80
cdef void *safe_malloc(size_t count) except NULL:
    """Allocate count bytes with malloc().

    :param count: Number of bytes to allocate.
    :return: The pointer handed back by malloc(), never NULL.
    :raises MemoryError: If malloc() returned NULL.
    """
    cdef void *block
    block = malloc(count)
    if block != NULL:
        return block
    raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
86
87
88
cdef void *safe_realloc(void * old, size_t count) except NULL:
    """Resize an allocation with realloc().

    :param old: The existing allocation to resize.
    :param count: The new size in bytes.
    :return: The pointer handed back by realloc(), never NULL.
    :raises MemoryError: If realloc() returned NULL.
    """
    cdef void *block
    block = realloc(old, count)
    if block != NULL:
        return block
    raise MemoryError('Failed to reallocate to %d bytes of memory'
                      % (count,))
95
96
97
cdef int safe_free(void **val) except -1:
    """Free the allocation that val points at and reset it to NULL.

    :param val: Address of the pointer to release; must not be NULL itself.
        If the pointer it holds is already NULL nothing is freed.
    :return: 0 on success.
    """
    assert val != NULL
    if val[0] == NULL:
        # Nothing allocated, nothing to release.
        return 0
    free(val[0])
    # Clear the caller's pointer so it cannot be freed a second time.
    val[0] = NULL
    return 0
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
102
0.23.17 by John Arbash Meinel
Create a wrapper function, so that lsprof will properly attribute time spent.
103
def make_delta_index(source):
    """Build and return a DeltaIndex for source.

    :param source: Passed straight through as the first source of the new
        DeltaIndex.
    :return: The newly created DeltaIndex.
    """
    index = DeltaIndex(source)
    return index
105
106
5698.2.5 by Martin
Switch approach to delta function interfaces and use a return code and outparam
107
cdef object _translate_delta_failure(delta_result result):
    """Map a failing delta_result code onto an exception instance.

    The exception is returned, not raised, so that callers can write
    ``raise _translate_delta_failure(res)``.

    :param result: A result code other than DELTA_OK.
    :return: An exception object describing the failure. Codes without a
        dedicated message yield an AssertionError naming the code.
    """
    if result == DELTA_OUT_OF_MEMORY:
        return MemoryError("Delta function failed to allocate memory")
    if result == DELTA_INDEX_NEEDED:
        return ValueError("Delta function requires delta_index param")
    if result == DELTA_SOURCE_EMPTY:
        return ValueError("Delta function given empty source_info param")
    if result == DELTA_SOURCE_BAD:
        return RuntimeError("Delta function given invalid source_info param")
    if result == DELTA_BUFFER_EMPTY:
        return ValueError("Delta function given empty buffer params")
    return AssertionError("Unrecognised delta result code: %d" % result)
119
120
5755.2.8 by John Arbash Meinel
Do a lot of renaming.
121
def _rabin_hash(content):
    """Return rabin_hash() of the start of content as a Python integer.

    :param content: An exact str instance of at least 16 bytes.
    :raises ValueError: If content is not an exact str, or is shorter than
        16 bytes.
    """
    cdef unsigned char *c_content
    cdef unsigned int hash_val

    if not PyString_CheckExact(content):
        raise ValueError('content must be a string')
    if len(content) < 16:
        raise ValueError('content must be at least 16 bytes long')
    c_content = <unsigned char*>(PyString_AS_STRING(content))
    hash_val = rabin_hash(c_content)
    # Try to cast it to an int, if it can fit
    return int(hash_val)
5755.2.3 by John Arbash Meinel
Add a max_entries_per_source to DeltaIndex
128
129
0.23.14 by John Arbash Meinel
Implement a DeltaIndex wrapper.
130
cdef class DeltaIndex:
    """Wrapper around the C delta_index, built up from one or more sources.

    Sources are registered with add_source() / add_delta_source() and
    make_delta() then creates a delta for a target text against the index.
    The index for the very first source is created lazily, the first time it
    is actually needed.
    """

    # We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
    # isn't performance critical
    # cdef readonly list _sources
    # The source strings; holding them here keeps alive the buffers that
    # _source_infos[i].buf points into.
    cdef readonly object _sources
    # One source_info per entry in _sources, _max_num_sources slots long.
    cdef source_info *_source_infos
    # The C index, or NULL while it has not been built yet.
    cdef delta_index *_index
    # Aggregate offset just past the last source that was added.
    cdef public unsigned long _source_offset
    cdef readonly unsigned int _max_num_sources
    # Handed to create_delta_index() as its max_entries argument.
    cdef public int _max_bytes_to_index

    def __init__(self, source=None, max_bytes_to_index=None):
        """Create an index, optionally seeded with a first source.

        :param source: If not None, added via add_source(source, 0).
        :param max_bytes_to_index: If not None, stored in
            _max_bytes_to_index; otherwise that attribute is 0.
        """
        # Release anything a previous __init__ call on this same object
        # allocated, otherwise resetting the pointers below would leak it.
        # C attributes of an extension type start out zeroed (__dealloc__
        # already depends on that), so on the first call this is a no-op.
        if self._index != NULL:
            free_delta_index(self._index)
        safe_free(<void **>&self._source_infos)
        self._sources = []
        self._index = NULL
        self._max_num_sources = 65000
        self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
                                                        * self._max_num_sources)
        self._source_offset = 0
        self._max_bytes_to_index = 0
        if max_bytes_to_index is not None:
            self._max_bytes_to_index = max_bytes_to_index

        if source is not None:
            self.add_source(source, 0)

    def __sizeof__(self):
        """Estimate the memory held by this object, in bytes."""
        # We want to track the _source_infos allocations, but the referenced
        # void* are actually tracked in _sources itself.
        # XXX: Cython is capable of doing sizeof(class) and returning the size
        #      of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
        #      to do it manually. *sigh* Note that we might get it wrong
        #      because of alignment issues.
        cdef Py_ssize_t size
        # PyObject start, vtable *, 3 pointers (_sources, _source_infos,
        # _index), _source_offset, _max_num_sources, _max_bytes_to_index
        size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
                 + sizeof(unsigned long)
                 + sizeof(unsigned int)
                 + sizeof(int))
                + (sizeof(source_info) * self._max_num_sources)
                + sizeof_delta_index(self._index))
        return size

    def __repr__(self):
        return '%s(%d, %d)' % (self.__class__.__name__,
            len(self._sources), self._source_offset)

    def __dealloc__(self):
        if self._index != NULL:
            free_delta_index(self._index)
            self._index = NULL
        safe_free(<void **>&self._source_infos)

    def _has_index(self):
        """Return True if the C index has been built already."""
        return (self._index != NULL)

    def _dump_index(self):
        """Dump the pointers in the index.

        This is an arbitrary layout, used for testing. It is not meant to be
        used in production code.

        :return: (hash_list, entry_list), or None if there is no index yet.
            hash_list   A list of offsets, so hash[i] points to the 'hash
                        bucket' starting at the given offset and going until
                        hash[i+1]
            entry_list  A list of (text_offset, hash_val). text_offset is the
                        offset in the "source" texts, and hash_val is the RABIN
                        hash for that offset.
                        Note that the entry should be in the hash bucket
                        defined by
                        hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
        """
        cdef int pos
        cdef unsigned int text_offset
        cdef unsigned int hash_val
        cdef unsigned int hash_offset
        if self._index == NULL:
            return None
        hash_list = []
        pos = 0
        while get_hash_offset(self._index, pos, &hash_offset):
            hash_list.append(int(hash_offset))
            pos += 1
        entry_list = []
        pos = 0
        while get_entry_summary(self._index, pos, &text_offset, &hash_val):
            # Map back using 'int' so that we don't get Long everywhere, when
            # almost everything is <2**31.
            val = tuple(map(int, [text_offset, hash_val]))
            entry_list.append(val)
            pos += 1
        return hash_list, entry_list

    def add_delta_source(self, delta, unadded_bytes):
        """Add a new delta to the source texts.

        :param delta: The text of the delta, this must be a byte string.
        :param unadded_bytes: Number of bytes that were added to the source
            that were not indexed.
        :raises TypeError: If delta is not an exact str.
        """
        cdef char *c_delta
        cdef Py_ssize_t c_delta_size
        cdef delta_index *index
        cdef delta_result res
        cdef unsigned int source_location
        cdef source_info *src

        if not PyString_CheckExact(delta):
            raise TypeError('delta is not a str')

        source_location = len(self._sources)
        if source_location >= self._max_num_sources:
            self._expand_sources()
        if source_location != 0 and self._index == NULL:
            # The first source is indexed lazily. Build that index now, the
            # same way add_source() does, so that there is an existing index
            # for the delta to extend.
            self._populate_first_index()
        self._sources.append(delta)
        c_delta = PyString_AS_STRING(delta)
        c_delta_size = PyString_GET_SIZE(delta)
        src = self._source_infos + source_location
        src.buf = c_delta
        src.size = c_delta_size
        src.agg_offset = self._source_offset + unadded_bytes
        with nogil:
            res = create_delta_index_from_delta(src, self._index, &index)
        if res != DELTA_OK:
            raise _translate_delta_failure(res)
        self._source_offset = src.agg_offset + src.size
        if index != self._index:
            # A replacement index was handed back; drop the old one.
            free_delta_index(self._index)
            self._index = index

    def add_source(self, source, unadded_bytes):
        """Add a new bit of source text to the delta indexes.

        The number of index entries is limited by self._max_bytes_to_index,
        which is passed to create_delta_index() as max_entries.

        :param source: The text in question, this must be a byte string
        :param unadded_bytes: Assume there are this many bytes that didn't get
            added between this source and the end of the previous source.
        :raises TypeError: If source is not an exact str.
        """
        cdef char *c_source
        cdef Py_ssize_t c_source_size
        cdef delta_index *index
        cdef delta_result res
        cdef unsigned int source_location
        cdef source_info *src

        if not PyString_CheckExact(source):
            raise TypeError('source is not a str')

        source_location = len(self._sources)
        if source_location >= self._max_num_sources:
            self._expand_sources()
        if source_location != 0 and self._index == NULL:
            # We were lazy about populating the index, create it now
            self._populate_first_index()
        self._sources.append(source)
        c_source = PyString_AS_STRING(source)
        c_source_size = PyString_GET_SIZE(source)
        src = self._source_infos + source_location
        src.buf = c_source
        src.size = c_source_size

        src.agg_offset = self._source_offset + unadded_bytes
        self._source_offset = src.agg_offset + src.size
        # We delay creating the index on the first insert
        if source_location != 0:
            with nogil:
                res = create_delta_index(src, self._index, &index,
                                         self._max_bytes_to_index)
            if res != DELTA_OK:
                raise _translate_delta_failure(res)
            if index != self._index:
                free_delta_index(self._index)
                self._index = index

    cdef _populate_first_index(self):
        """Build the index for the single, lazily-added first source."""
        cdef delta_index *index
        cdef delta_result res
        if len(self._sources) != 1 or self._index != NULL:
            raise AssertionError('_populate_first_index should only be'
                ' called when we have a single source and no index yet')

        # We know that self._index is already NULL, so create_delta_index
        # will always create a new index unless there's a malloc failure
        with nogil:
            res = create_delta_index(&self._source_infos[0], NULL, &index,
                                     self._max_bytes_to_index)
        if res != DELTA_OK:
            raise _translate_delta_failure(res)
        self._index = index

    cdef _expand_sources(self):
        """Growing the source table is not supported; always raises.

        Doubling _max_num_sources and reallocating _source_infos would move
        the source_info structs, and per the error message below the index
        pointers would then have to be changed too.
        """
        raise RuntimeError('if we move self._source_infos, then we need to'
                           ' change all of the index pointers as well.')

    def make_delta(self, target_bytes, max_delta_size=0):
        """Create a delta from the current source to the target bytes.

        :param target_bytes: The text to create a delta for, must be an
            exact str.
        :param max_delta_size: Passed to create_delta() as max_delta_size.
        :return: The delta as a str. None if no source has been added yet,
            or if create_delta() reported DELTA_SIZE_TOO_BIG.
        :raises TypeError: If target_bytes is not an exact str.
        """
        cdef char *target
        cdef Py_ssize_t target_size
        cdef void * delta
        cdef unsigned long delta_size
        cdef unsigned long c_max_delta_size
        cdef delta_result res

        if self._index == NULL:
            if len(self._sources) == 0:
                return None
            # We were just lazy about generating the index
            self._populate_first_index()

        if not PyString_CheckExact(target_bytes):
            raise TypeError('target is not a str')

        target = PyString_AS_STRING(target_bytes)
        target_size = PyString_GET_SIZE(target_bytes)

        # TODO: inline some of create_delta so we at least don't have to double
        #       malloc, and can instead use PyString_FromStringAndSize, to
        #       allocate the bytes into the final string
        c_max_delta_size = max_delta_size
        with nogil:
            res = create_delta(self._index, target, target_size,
                               &delta_size, c_max_delta_size, &delta)
        result = None
        if res == DELTA_OK:
            # The buffer from create_delta() is ours to release; do so even
            # when building the Python string fails.
            try:
                result = PyString_FromStringAndSize(<char *>delta, delta_size)
            finally:
                free(delta)
        elif res != DELTA_SIZE_TOO_BIG:
            raise _translate_delta_failure(res)
        return result
368
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
369
370
def make_delta(source_bytes, target_bytes):
    """Create a delta, this is a wrapper around DeltaIndex.make_delta.

    :param source_bytes: The single source text handed to DeltaIndex.
    :param target_bytes: The text to create a delta for.
    :return: Whatever DeltaIndex.make_delta(target_bytes) returns.
    """
    return DeltaIndex(source_bytes).make_delta(target_bytes)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
374
375
376
def apply_delta(source_bytes, delta_bytes):
    """Apply a delta generated by make_delta to source_bytes.

    :param source_bytes: The text the delta was created against, an exact str.
    :param delta_bytes: The delta itself, an exact str.
    :return: The result of _apply_delta() for these two buffers.
    :raises TypeError: If either argument is not an exact str.
    :raises RuntimeError: If the delta is shorter than DELTA_SIZE_MIN.
    """
    cdef Py_ssize_t delta_size

    if not PyString_CheckExact(source_bytes):
        raise TypeError('source is not a str')
    if not PyString_CheckExact(delta_bytes):
        raise TypeError('delta is not a str')
    # Code taken from patch-delta.c, only brought here to give better error
    # handling, and to avoid double allocating memory
    delta_size = PyString_GET_SIZE(delta_bytes)
    if (delta_size < DELTA_SIZE_MIN):
        # XXX: Invalid delta block
        raise RuntimeError('delta_size %d smaller than min delta size %d'
                           % (delta_size, DELTA_SIZE_MIN))

    return _apply_delta(PyString_AS_STRING(source_bytes),
                        PyString_GET_SIZE(source_bytes),
                        PyString_AS_STRING(delta_bytes),
                        delta_size)
399
400
3735.40.20 by John Arbash Meinel
cleanup the apply_delta code a bit.
401
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
    unsigned char cmd, unsigned int *offset,
    unsigned int *length) nogil: # cannot_raise
    """Decode a copy instruction from the next few bytes.

    A copy instruction is a variable number of bytes, so we will parse the
    bytes we care about, and return the new position, as well as the offset and
    length referred to in the bytes.

    Bits 0x01..0x08 of cmd each announce one offset byte (least significant
    first) and bits 0x10..0x40 each announce one length byte. A decoded
    length of 0 stands for 0x10000.

    This function does no bounds checking; the caller has to make sure that
    one byte per set bit in (cmd & 0x7F) is readable at bytes.

    :param bytes: Pointer to the start of bytes after cmd
    :param cmd: The command code
    :param offset: Out parameter, receives the decoded offset
    :param length: Out parameter, receives the decoded length
    :return: Pointer to the bytes just after the last decode byte
    """
    cdef unsigned int off, size, count
    off = 0
    size = 0
    count = 0
    # Each byte is widened to unsigned int *before* it is shifted. Without
    # the cast the byte is promoted to a signed int, and shifting a value
    # >= 0x80 left by 24 overflows that int, which is undefined in C.
    if (cmd & 0x01):
        off = bytes[count]
        count = count + 1
    if (cmd & 0x02):
        off = off | ((<unsigned int>bytes[count]) << 8)
        count = count + 1
    if (cmd & 0x04):
        off = off | ((<unsigned int>bytes[count]) << 16)
        count = count + 1
    if (cmd & 0x08):
        off = off | ((<unsigned int>bytes[count]) << 24)
        count = count + 1
    if (cmd & 0x10):
        size = bytes[count]
        count = count + 1
    if (cmd & 0x20):
        size = size | ((<unsigned int>bytes[count]) << 8)
        count = count + 1
    if (cmd & 0x40):
        size = size | ((<unsigned int>bytes[count]) << 16)
        count = count + 1
    if (size == 0):
        size = 0x10000
    offset[0] = off
    length[0] = size
    return bytes + count
444
445
3735.40.19 by John Arbash Meinel
Implement apply_delta_to_source which doesn't have to malloc another string.
446
cdef object _apply_delta(char *source, Py_ssize_t source_size,
                         char *delta, Py_ssize_t delta_size):
    """common functionality between apply_delta and apply_delta_to_source.

    :param source: Start of the bytes that copy instructions refer to.
    :param source_size: Number of bytes readable at source.
    :param delta: Start of the delta (header followed by instructions).
    :param delta_size: Number of bytes readable at delta.
    :return: A new str holding the reconstructed text.
    :raises ValueError: For a copy outside of source or larger than the
        remaining output, for opcode 0, or for an insert larger than the
        remaining output.
    :raises RuntimeError: If an instruction needs more bytes than the delta
        has left, or if the amount of data produced/consumed does not match
        the delta header.
    """
    cdef unsigned char *data, *top
    cdef unsigned char *dst_buf, *out, cmd
    cdef unsigned char operand_bits
    cdef Py_ssize_t operand_bytes
    cdef Py_ssize_t size
    cdef unsigned int cp_off, cp_size
    cdef int failed

    data = <unsigned char *>delta
    top = data + delta_size

    # now the result size
    size = get_delta_hdr_size(&data, top)
    result = PyString_FromStringAndSize(NULL, size)
    dst_buf = <unsigned char*>PyString_AS_STRING(result)

    # Errors are recorded in 'failed' and raised after the loop, because no
    # Python exception can be raised while the GIL is released.
    failed = 0
    with nogil:
        out = dst_buf
        while (data < top):
            cmd = data[0]
            data = data + 1
            if (cmd & 0x80):
                # Copy instruction
                # Every set bit in the low 7 bits of cmd stands for one
                # operand byte. Make sure they are all inside the delta
                # before _decode_copy_instruction reads them.
                operand_bytes = 0
                operand_bits = cmd & 0x7F
                while operand_bits != 0:
                    operand_bytes = operand_bytes + (operand_bits & 0x01)
                    operand_bits = operand_bits >> 1
                if operand_bytes > (top - data):
                    failed = 4
                    break
                data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
                if (cp_off + cp_size < cp_size or
                    cp_off + cp_size > <unsigned int>source_size or
                    cp_size > <unsigned int>size):
                    failed = 1
                    break
                memcpy(out, source + cp_off, cp_size)
                out = out + cp_size
                size = size - cp_size
            else:
                # Insert instruction
                if cmd == 0:
                    # cmd == 0 is reserved for future encoding
                    # extensions. In the mean time we must fail when
                    # encountering them (might be data corruption).
                    failed = 2
                    break
                if cmd > size:
                    failed = 3
                    break
                if cmd > (top - data):
                    # The bytes to insert would be read from beyond the end
                    # of the delta.
                    failed = 4
                    break
                memcpy(out, data, cmd)
                out = out + cmd
                data = data + cmd
                size = size - cmd
    if failed:
        if failed == 1:
            raise ValueError('Something wrong with:'
                ' cp_off = %s, cp_size = %s'
                ' source_size = %s, size = %s'
                % (cp_off, cp_size, source_size, size))
        elif failed == 2:
            raise ValueError('Got delta opcode: 0, not supported')
        elif failed == 3:
            raise ValueError('Insert instruction longer than remaining'
                ' bytes: %d > %d' % (cmd, size))
        elif failed == 4:
            # RuntimeError, because that is what the sanity check below
            # raised for an over-long insert before this check existed.
            raise RuntimeError('Delta instruction 0x%02x needs more bytes'
                ' than the %d left in the delta'
                % (cmd, <int>(top - data)))

    # sanity check
    if (data != top or size != 0):
        raise RuntimeError('Did not extract the number of bytes we expected'
            ' we were left with %d bytes in "size", and top - data = %d'
            % (size, <int>(top - data)))

    # *dst_size = out - dst_buf;
    if (out - dst_buf) != PyString_GET_SIZE(result):
        raise RuntimeError('Number of bytes extracted did not match the'
            ' size encoded in the delta header.')
    return result
3735.40.16 by John Arbash Meinel
Implement (de|en)code_base128_int in pyrex.
519
520
3735.40.19 by John Arbash Meinel
Implement apply_delta_to_source which doesn't have to malloc another string.
521
def apply_delta_to_source(source, delta_start, delta_end):
    """Extract a delta from source bytes, and apply it.

    The delta is read from source[delta_start:delta_end]. Only the bytes of
    source that come before delta_start are handed to _apply_delta as the
    content the delta may copy from.

    :param source: An exact str containing the delta (and the bytes before
        it).
    :param delta_start: Offset in source of the first byte of the delta.
    :param delta_end: Offset in source one past the last byte of the delta.
    :return: The result of _apply_delta for the selected delta.
    :raises TypeError: If source is not exactly a str.
    :raises ValueError: If delta_start is negative, or the range
        [delta_start, delta_end) is empty, inverted, or extends past the end
        of source.
    """
    cdef char *c_source
    cdef Py_ssize_t c_source_size
    cdef char *c_delta
    cdef Py_ssize_t c_delta_size
    cdef Py_ssize_t c_delta_start, c_delta_end

    if not PyString_CheckExact(source):
        raise TypeError('source is not a str')
    c_source_size = PyString_GET_SIZE(source)
    c_delta_start = delta_start
    c_delta_end = delta_end
    # A negative start passes every check below, and would make c_delta point
    # in front of the string buffer (and pass a negative source size along).
    if c_delta_start < 0:
        raise ValueError('delta starts before source')
    if c_delta_start >= c_source_size:
        raise ValueError('delta starts after source')
    if c_delta_end > c_source_size:
        raise ValueError('delta ends after source')
    if c_delta_start >= c_delta_end:
        raise ValueError('delta starts after it ends')

    c_delta_size = c_delta_end - c_delta_start
    c_source = PyString_AS_STRING(source)
    c_delta = c_source + c_delta_start
    # We don't use source_size, because we know the delta should not refer to
    # any bytes after it starts
    return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
547
548
3735.40.16 by John Arbash Meinel
Implement (de|en)code_base128_int in pyrex.
549
def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding.

    Each output byte carries 7 bits of the value, least significant group
    first. Every byte except the last has 0x80 set to signal that more bytes
    follow.

    :param val: The integer to encode; it is converted to a C unsigned int
        before encoding.
    :return: A str holding the encoded bytes.
    :raises ValueError: If the encoding would not fit in the local buffer.
    """
    cdef unsigned int c_val
    cdef Py_ssize_t count
    # 8 bytes is more than needed: a 32-bit value encodes to at most 5 bytes.
    cdef unsigned char c_bytes[8]

    c_val = val
    count = 0
    while c_val >= 0x80 and count < 8:
        # Low 7 bits of the value, with the continuation bit set.
        c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
        c_val = c_val >> 7
        count = count + 1
    if count >= 8 or c_val >= 0x80:
        raise ValueError('encode_base128_int overflowed the buffer')
    # Final byte: what is left is < 0x80, so the continuation bit is clear.
    c_bytes[count] = <unsigned char>(c_val & 0xFF)
    count = count + 1
    return PyString_FromStringAndSize(<char *>c_bytes, count)
567
568
569
def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding.

    Decoding starts at the first byte and stops at the first byte that does
    not have 0x80 set; anything after that byte is ignored.

    :param bytes: An exact str that begins with the encoded integer.
    :return: A tuple (value, offset), where offset is the number of bytes
        that were consumed from the start of bytes.
    :raises TypeError: If bytes is not exactly a str.
    :raises ValueError: If bytes is empty, if every byte has 0x80 set, or if
        the encoding has more continuation bytes than a 32-bit value can
        use.
    """
    cdef int offset
    cdef int val
    cdef unsigned int uval
    cdef int shift
    cdef Py_ssize_t num_low_bytes
    cdef unsigned char *c_bytes

    offset = 0
    uval = 0
    shift = 0
    if not PyString_CheckExact(bytes):
        raise TypeError('bytes is not a string')
    c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
    # We take off 1, because we have to be able to decode the non-expanded byte
    num_low_bytes = PyString_GET_SIZE(bytes) - 1
    if num_low_bytes < 0:
        # Without this, the string's NUL terminator would be decoded as a
        # value of 0 and we would claim to have consumed one byte.
        raise ValueError('Data not properly formatted, there are no bytes'
                         ' to decode.')
    while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
        if shift >= 28:
            # A continuation byte here means the next shift would be >= 32,
            # which is undefined for a 32-bit C integer.
            raise ValueError('Data not properly formatted, the encoded'
                             ' value does not fit in 32 bits.')
        # Accumulate unsigned so that filling the top bit is well defined.
        uval = uval | ((<unsigned int>(c_bytes[offset] & 0x7F)) << shift)
        shift = shift + 7
        offset = offset + 1
    if c_bytes[offset] & 0x80:
        raise ValueError('Data not properly formatted, we ran out of'
                         ' bytes before 0x80 stopped being set.')
    # shift is at most 28 here; bits that do not fit in the unsigned int are
    # dropped by the shift.
    uval = uval | ((<unsigned int>c_bytes[offset]) << shift)
    offset = offset + 1
    if uval > 0x7FFFFFFF:
        # Too large for a signed C int, return it as an unsigned value.
        return uval, offset
    val = <int>uval
    return val, offset
599
600