~bzr-pqm/bzr/bzr.dev

0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
1
# Copyright (C) 2008 Canonical Limited.
2
# 
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as published
5
# by the Free Software Foundation.
6
# 
7
# This program is distributed in the hope that it will be useful,
8
# but WITHOUT ANY WARRANTY; without even the implied warranty of
9
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
# GNU General Public License for more details.
11
# 
12
# You should have received a copy of the GNU General Public License
13
# along with this program; if not, write to the Free Software
14
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
15
# 
16
17
"""Compiled extensions for doing compression."""
18
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
19
cdef extern from *:
20
    ctypedef unsigned long size_t
21
    void * malloc(size_t)
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
22
    void * realloc(void *, size_t)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
23
    void free(void *)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
24
    void memcpy(void *, void *, size_t)
25
26
cdef extern from "delta.h":
0.23.42 by John Arbash Meinel
Change the code around again.
27
    struct source_info:
28
        void *buf
29
        unsigned long size
30
        unsigned long agg_offset
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
31
    struct delta_index:
32
        unsigned long memsize
33
        void *src_buf
34
        unsigned long src_size
35
        unsigned int hash_mask
36
        # struct index_entry *hash[]
0.23.42 by John Arbash Meinel
Change the code around again.
37
    delta_index * create_delta_index(source_info *src)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
38
    void free_delta_index(delta_index *index)
0.23.24 by John Arbash Meinel
Change the code so that we can pass in multiple sources to match against.
39
    void *create_delta(delta_index **indexes,
40
             unsigned int num_indexes,
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
41
             void *buf, unsigned long bufsize,
42
             unsigned long *delta_size, unsigned long max_delta_size)
43
    unsigned long get_delta_hdr_size(unsigned char **datap,
44
                                     unsigned char *top)
45
    Py_ssize_t DELTA_SIZE_MIN
0.23.7 by John Arbash Meinel
Add a apply_delta2 function, just in case it matters.
46
    void *patch_delta(void *src_buf, unsigned long src_size,
47
                      void *delta_buf, unsigned long delta_size,
48
                      unsigned long *dst_size)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
49
50
cdef extern from "Python.h":
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
51
    int PyString_CheckExact(object)
52
    char * PyString_AS_STRING(object)
53
    Py_ssize_t PyString_GET_SIZE(object)
54
    object PyString_FromStringAndSize(char *, Py_ssize_t)
55
56
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
57
cdef void *safe_malloc(size_t count) except NULL:
58
    cdef void *result
59
    result = malloc(count)
60
    if result == NULL:
61
        raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
62
    return result
63
64
65
cdef void *safe_realloc(void * old, size_t count) except NULL:
66
    cdef void *result
67
    result = realloc(old, count)
68
    if result == NULL:
69
        raise MemoryError('Failed to reallocate to %d bytes of memory'
70
                          % (count,))
71
    return result
72
73
74
cdef int safe_free(void **val) except -1:
75
    assert val != NULL
76
    if val[0] != NULL:
77
        free(val[0])
78
        val[0] = NULL
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
79
0.23.17 by John Arbash Meinel
Create a wrapper function, so that lsprof will properly attribute time spent.
80
def make_delta_index(source):
81
    return DeltaIndex(source)
82
83
0.23.14 by John Arbash Meinel
Implement a DeltaIndex wrapper.
84
cdef class DeltaIndex:
85
0.23.40 by John Arbash Meinel
Add a comment why we aren't using the list type for _sources
86
    # We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
87
    # isn't performance critical
88
    # cdef readonly list _sources
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
89
    cdef readonly object _sources
0.23.42 by John Arbash Meinel
Change the code around again.
90
    cdef source_info *_source_infos
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
91
    cdef delta_index **_indexes
92
    cdef readonly unsigned int _num_indexes
93
    cdef readonly unsigned int _max_num_indexes
0.23.42 by John Arbash Meinel
Change the code around again.
94
    cdef readonly unsigned int _max_num_sources
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
95
    cdef public unsigned long _source_offset
0.23.14 by John Arbash Meinel
Implement a DeltaIndex wrapper.
96
97
    def __repr__(self):
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
98
        return '%s(%d, %d, %d)' % (self.__class__.__name__,
99
            len(self._sources), self._source_offset,
100
            self._num_indexes)
101
102
    def __init__(self, source=None):
103
        self._sources = []
104
        self._max_num_indexes = 1024
105
        self._indexes = <delta_index**>safe_malloc(sizeof(delta_index*)
106
                                                   * self._max_num_indexes)
0.23.42 by John Arbash Meinel
Change the code around again.
107
        self._max_num_sources = 1024
108
        self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
109
                                                        * self._max_num_sources)
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
110
        self._num_indexes = 0
111
        self._source_offset = 0
112
113
        if source is not None:
0.23.26 by John Arbash Meinel
We now start to make use of the ability to extend the delta index
114
            self.add_source(source, 0)
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
115
116
    def __dealloc__(self):
117
        self._ensure_no_indexes()
0.23.42 by John Arbash Meinel
Change the code around again.
118
        safe_free(<void **>&self._source_infos)
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
119
0.23.26 by John Arbash Meinel
We now start to make use of the ability to extend the delta index
120
    def add_source(self, source, unadded_bytes):
121
        """Add a new bit of source text to the delta indexes.
122
123
        :param source: The text in question, this must be a byte string
124
        :param unadded_bytes: Assume there are this many bytes that didn't get
125
            added between this source and the end of the previous source.
126
        """
0.23.14 by John Arbash Meinel
Implement a DeltaIndex wrapper.
127
        cdef char *c_source
128
        cdef Py_ssize_t c_source_size
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
129
        cdef delta_index *index
0.23.42 by John Arbash Meinel
Change the code around again.
130
        cdef unsigned int source_location
131
        cdef source_info *src
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
132
        cdef unsigned int num_indexes
0.23.14 by John Arbash Meinel
Implement a DeltaIndex wrapper.
133
134
        if not PyString_CheckExact(source):
135
            raise TypeError('source is not a str')
136
0.23.42 by John Arbash Meinel
Change the code around again.
137
        source_location = len(self._sources)
138
        if source_location >= self._max_num_sources:
139
            self._expand_sources()
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
140
        self._sources.append(source)
0.23.14 by John Arbash Meinel
Implement a DeltaIndex wrapper.
141
        c_source = PyString_AS_STRING(source)
142
        c_source_size = PyString_GET_SIZE(source)
0.23.42 by John Arbash Meinel
Change the code around again.
143
        src = self._source_infos + source_location
144
        src.buf = c_source
145
        src.size = c_source_size
0.23.14 by John Arbash Meinel
Implement a DeltaIndex wrapper.
146
147
        # TODO: Are usage is ultimately going to be different than the one that
148
        #       was originally designed. Specifically, we are going to want to
149
        #       be able to update the index by hashing future data. It should
150
        #       fit just fine into the structure. But for now, we just wrap
151
        #       create_delta_index (For example, we could always reserve enough
152
        #       space to hash a 4MB string, etc.)
0.23.42 by John Arbash Meinel
Change the code around again.
153
        src.agg_offset = self._source_offset + unadded_bytes
154
        index = create_delta_index(src)
155
        self._source_offset = src.agg_offset + src.size
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
156
        if index != NULL:
157
            num_indexes = self._num_indexes + 1
158
            if num_indexes >= self._max_num_indexes:
159
                self._expand_indexes()
160
            self._indexes[self._num_indexes] = index
161
            self._num_indexes = num_indexes
162
163
    cdef _expand_indexes(self):
164
        self._max_num_indexes = self._max_num_indexes * 2
165
        self._indexes = <delta_index **>safe_realloc(self._indexes,
166
                                                sizeof(delta_index *)
167
                                                * self._max_num_indexes)
168
0.23.42 by John Arbash Meinel
Change the code around again.
169
    cdef _expand_sources(self):
170
        self._max_num_sources = self._max_num_sources * 2
171
        self._source_infos = <source_info *>safe_realloc(self._source_infos,
172
                                                sizeof(source_info)
173
                                                * self._max_num_sources)
174
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
175
    cdef _ensure_no_indexes(self):
176
        cdef int i
177
178
        if self._indexes != NULL:
179
            for i from 0 <= i < self._num_indexes:
180
                free_delta_index(self._indexes[i])
181
                self._indexes[i] = NULL
182
            free(self._indexes)
183
            self._indexes = NULL
184
            self._max_num_indexes = 0
185
            self._num_indexes = 0
0.23.14 by John Arbash Meinel
Implement a DeltaIndex wrapper.
186
187
    def make_delta(self, target_bytes, max_delta_size=0):
188
        """Create a delta from the current source to the target bytes."""
189
        cdef char *target
190
        cdef Py_ssize_t target_size
191
        cdef void * delta
192
        cdef unsigned long delta_size
193
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
194
        if self._num_indexes == 0:
0.23.15 by John Arbash Meinel
Handle when self._index is NULL, mostly because the source text was the empty strig.
195
            return None
0.23.14 by John Arbash Meinel
Implement a DeltaIndex wrapper.
196
197
        if not PyString_CheckExact(target_bytes):
198
            raise TypeError('target is not a str')
199
200
        target = PyString_AS_STRING(target_bytes)
201
        target_size = PyString_GET_SIZE(target_bytes)
202
203
        # TODO: inline some of create_delta so we at least don't have to double
204
        #       malloc, and can instead use PyString_FromStringAndSize, to
205
        #       allocate the bytes into the final string
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
206
        delta = create_delta(self._indexes, self._num_indexes,
207
                             target, target_size,
0.23.14 by John Arbash Meinel
Implement a DeltaIndex wrapper.
208
                             &delta_size, max_delta_size)
209
        result = None
210
        if delta:
211
            result = PyString_FromStringAndSize(<char *>delta, delta_size)
212
            free(delta)
213
        return result
214
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
215
216
def make_delta(source_bytes, target_bytes):
0.23.42 by John Arbash Meinel
Change the code around again.
217
    """Create a delta, this is a wrapper around DeltaIndex.make_delta."""
218
    di = DeltaIndex(source_bytes)
219
    return di.make_delta(target_bytes)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
220
221
222
def apply_delta(source_bytes, delta_bytes):
223
    """Apply a delta generated by make_delta to source_bytes."""
224
    cdef char *source
225
    cdef Py_ssize_t source_size
226
    cdef char *delta
227
    cdef Py_ssize_t delta_size
228
    cdef unsigned char *data, *top
229
    cdef unsigned char *dst_buf, *out, cmd
230
    cdef Py_ssize_t size
231
    cdef unsigned long cp_off, cp_size
232
233
    if not PyString_CheckExact(source_bytes):
234
        raise TypeError('source is not a str')
235
    if not PyString_CheckExact(delta_bytes):
236
        raise TypeError('delta is not a str')
237
238
    source = PyString_AS_STRING(source_bytes)
239
    source_size = PyString_GET_SIZE(source_bytes)
240
    delta = PyString_AS_STRING(delta_bytes)
241
    delta_size = PyString_GET_SIZE(delta_bytes)
242
243
    # Code taken from patch-delta.c, only brought here to give better error
244
    # handling, and to avoid double allocating memory
245
    if (delta_size < DELTA_SIZE_MIN):
246
        # XXX: Invalid delta block
0.23.33 by John Arbash Meinel
Fix a bug when handling multiple large-range copies.
247
        raise RuntimeError('delta_size %d smaller than min delta size %d'
248
                           % (delta_size, DELTA_SIZE_MIN))
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
249
250
    data = <unsigned char *>delta
251
    top = data + delta_size
252
253
    # make sure the orig file size matches what we expect
254
    # XXX: gcc warns because data isn't defined as 'const'
255
    size = get_delta_hdr_size(&data, top)
0.23.10 by John Arbash Meinel
Allowing the source bytes to be longer than expected.
256
    if (size > source_size):
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
257
        # XXX: mismatched source size
0.23.33 by John Arbash Meinel
Fix a bug when handling multiple large-range copies.
258
        raise RuntimeError('source size %d < expected source size %d'
259
                           % (source_size, size))
0.23.10 by John Arbash Meinel
Allowing the source bytes to be longer than expected.
260
    source_size = size
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
261
262
    # now the result size
263
    size = get_delta_hdr_size(&data, top)
264
    result = PyString_FromStringAndSize(NULL, size)
265
    dst_buf = <unsigned char*>PyString_AS_STRING(result)
266
    # XXX: The original code added a trailing null here, but this shouldn't be
267
    #      necessary when using PyString_FromStringAndSize
268
    # dst_buf[size] = 0
269
270
    out = dst_buf
271
    while (data < top):
272
        cmd = data[0]
273
        data = data + 1
274
        if (cmd & 0x80):
275
            cp_off = cp_size = 0
276
            if (cmd & 0x01):
277
                cp_off = data[0]
278
                data = data + 1
279
            if (cmd & 0x02):
0.24.1 by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4
280
                cp_off = cp_off | (data[0] << 8)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
281
                data = data + 1
282
            if (cmd & 0x04):
0.24.1 by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4
283
                cp_off = cp_off | (data[0] << 16)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
284
                data = data + 1
285
            if (cmd & 0x08):
0.24.1 by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4
286
                cp_off = cp_off | (data[0] << 24)
287
                data = data + 1
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
288
            if (cmd & 0x10):
289
                cp_size = data[0]
290
                data = data + 1
291
            if (cmd & 0x20):
0.24.1 by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4
292
                cp_size = cp_size | (data[0] << 8)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
293
                data = data + 1
294
            if (cmd & 0x40):
0.24.1 by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4
295
                cp_size = cp_size | (data[0] << 16)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
296
                data = data + 1
297
            if (cp_size == 0):
298
                cp_size = 0x10000
299
            if (cp_off + cp_size < cp_size or
300
                cp_off + cp_size > source_size or
301
                cp_size > size):
0.23.33 by John Arbash Meinel
Fix a bug when handling multiple large-range copies.
302
                raise RuntimeError('Something wrong with:'
303
                    ' cp_off = %s, cp_size = %s'
304
                    ' source_size = %s, size = %s'
305
                    % (cp_off, cp_size, source_size, size))
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
306
            memcpy(out, source + cp_off, cp_size)
307
            out = out + cp_size
0.24.1 by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4
308
            size = size - cp_size
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
309
        elif (cmd):
310
            if (cmd > size):
0.23.33 by John Arbash Meinel
Fix a bug when handling multiple large-range copies.
311
                raise RuntimeError('Insert instruction longer than remaining'
312
                    ' bytes: %d > %d' % (cmd, size))
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
313
            memcpy(out, data, cmd)
314
            out = out + cmd
315
            data = data + cmd
0.24.1 by John Arbash Meinel
Make the groupcompress pyrex extension compatible with pyrex 0.9.6.4
316
            size = size - cmd
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
317
        else:
318
            # /*
319
            #  * cmd == 0 is reserved for future encoding
320
            #  * extensions. In the mean time we must fail when
321
            #  * encountering them (might be data corruption).
322
            #  */
323
            ## /* XXX: error("unexpected delta opcode 0"); */
0.23.33 by John Arbash Meinel
Fix a bug when handling multiple large-range copies.
324
            raise RuntimeError('Got delta opcode: 0, not supported')
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
325
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
326
    # /* sanity check */
327
    if (data != top or size != 0):
328
        ## /* XXX: error("delta replay has gone wild"); */
0.23.33 by John Arbash Meinel
Fix a bug when handling multiple large-range copies.
329
        raise RuntimeError('Did not extract the number of bytes we expected'
330
            ' we were left with %d bytes in "size", and top - data = %d'
331
            % (size, <int>(top - data)))
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
332
        return None
333
334
    # *dst_size = out - dst_buf;
335
    assert (out - dst_buf) == PyString_GET_SIZE(result)
336
    return result