~bzr-pqm/bzr/bzr.dev

4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
1
# Copyright (C) 2009 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17
"""Python version of compiled extensions for doing compression.
18
19
We separate the implementation from the groupcompress.py to avoid importing
20
useless stuff.
21
"""
22
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
23
from bzrlib import osutils
24
25
3735.40.13 by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex.
26
class LinesDeltaIndex(object):
27
    """This class indexes matches between strings.
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
28
29
    :ivar lines: The 'static' lines that will be preserved between runs.
3735.40.13 by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex.
30
    :ivar _matching_lines: A dict of {line:[matching offsets]}
31
    :ivar line_offsets: The byte offset for the end of each line, used to
32
        quickly map between a matching line number and the byte location
33
    :ivar endpoint: The total number of bytes in self.line_offsets
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
34
    """
35
36
    def __init__(self, lines):
3735.40.13 by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex.
37
        self.lines = []
3735.40.6 by John Arbash Meinel
Move more information down into EquivalenceTable.
38
        self.line_offsets = []
3735.40.13 by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex.
39
        self.endpoint = 0
40
        self._matching_lines = {}
41
        self.extend_lines(lines, [True]*len(lines))
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
42
43
    def _update_matching_lines(self, new_lines, index):
44
        matches = self._matching_lines
45
        start_idx = len(self.lines)
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
46
        if len(new_lines) != len(index):
47
            raise AssertionError('The number of lines to be indexed does'
48
                ' not match the index/don\'t index flags: %d != %d'
49
                % (len(new_lines), len(index)))
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
50
        for idx, do_index in enumerate(index):
51
            if not do_index:
52
                continue
53
            matches.setdefault(new_lines[idx], []).append(start_idx + idx)
54
55
    def get_matches(self, line):
56
        """Return the lines which match the line in right."""
57
        try:
58
            return self._matching_lines[line]
59
        except KeyError:
60
            return None
61
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
62
    def _get_longest_match(self, lines, pos, locations):
63
        """Look at all matches for the current line, return the longest.
64
65
        :param lines: The lines we are matching against
66
        :param pos: The current location we care about
67
        :param locations: A list of lines that matched the current location.
68
            This may be None, but often we'll have already found matches for
69
            this line.
70
        :return: (start_in_self, start_in_lines, num_lines)
71
            All values are the offset in the list (aka the line number)
72
            If start_in_self is None, then we have no matches, and this line
73
            should be inserted in the target.
74
        """
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
75
        range_start = pos
76
        range_len = 0
77
        copy_ends = None
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
78
        max_pos = len(lines)
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
79
        while pos < max_pos:
80
            if locations is None:
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
81
                # TODO: is try/except better than get(..., None)?
3735.40.14 by John Arbash Meinel
Get rid of the self._right_lines state. It doesn't matter anymore.
82
                try:
83
                    locations = self._matching_lines[lines[pos]]
84
                except KeyError:
85
                    locations = None
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
86
            if locations is None:
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
87
                # No more matches, just return whatever we have, but we know
88
                # that this last position is not going to match anything
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
89
                pos += 1
90
                break
91
            else:
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
92
                # We have a match
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
93
                if copy_ends is None:
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
94
                    # This is the first match in a range
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
95
                    copy_ends = [loc + 1 for loc in locations]
96
                    range_len = 1
97
                    locations = None # Consumed
98
                else:
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
99
                    # We have a match started, compare to see if any of the
100
                    # current matches can be continued
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
101
                    next_locations = set(copy_ends).intersection(locations)
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
102
                    if next_locations:
103
                        # At least one of the regions continues to match
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
104
                        copy_ends = [loc + 1 for loc in next_locations]
105
                        range_len += 1
106
                        locations = None # Consumed
107
                    else:
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
108
                        # All current regions no longer match.
109
                        # This line does still match something, just not at the
110
                        # end of the previous matches. We will return locations
111
                        # so that we can avoid another _matching_lines lookup.
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
112
                        break
113
            pos += 1
114
        if copy_ends is None:
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
115
            # We have no matches, this is a pure insert
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
116
            return None, pos, locations
117
        return (((min(copy_ends) - range_len, range_start, range_len)),
118
                pos, locations)
119
120
    def get_matching_blocks(self, lines, soft=False):
121
        """Return the ranges in lines which match self.lines.
122
123
        :param lines: lines to compress
124
        :return: A list of (old_start, new_start, length) tuples which reflect
125
            a region in self.lines that is present in lines.  The last element
126
            of the list is always (old_len, new_len, 0) to provide a end point
127
            for generating instructions from the matching blocks list.
128
        """
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
129
        # In this code, we iterate over multiple _get_longest_match calls, to
130
        # find the next longest copy, and possible insert regions. We then
131
        # convert that to the simple matching_blocks representation, since
132
        # otherwise inserting 10 lines in a row would show up as 10
133
        # instructions.
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
134
        result = []
135
        pos = 0
136
        locations = None
137
        max_pos = len(lines)
138
        result_append = result.append
139
        min_match_bytes = 10
140
        if soft:
141
            min_match_bytes = 200
142
        while pos < max_pos:
3735.40.14 by John Arbash Meinel
Get rid of the self._right_lines state. It doesn't matter anymore.
143
            block, pos, locations = self._get_longest_match(lines, pos,
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
144
                                                            locations)
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
145
            if block is not None:
3735.40.14 by John Arbash Meinel
Get rid of the self._right_lines state. It doesn't matter anymore.
146
                # Check to see if we match fewer than min_match_bytes. As we
147
                # will turn this into a pure 'insert', rather than a copy.
148
                # block[-1] is the number of lines. A quick check says if we
149
                # have more lines than min_match_bytes, then we know we have
150
                # enough bytes.
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
151
                if block[-1] < min_match_bytes:
152
                    # This block may be a 'short' block, check
153
                    old_start, new_start, range_len = block
154
                    matched_bytes = sum(map(len,
155
                        lines[new_start:new_start + range_len]))
156
                    if matched_bytes < min_match_bytes:
157
                        block = None
158
            if block is not None:
159
                result_append(block)
160
        result_append((len(self.lines), len(lines), 0))
161
        return result
162
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
163
    def extend_lines(self, lines, index):
164
        """Add more lines to the left-lines list.
165
166
        :param lines: A list of lines to add
167
        :param index: A True/False for each node to define if it should be
168
            indexed.
169
        """
170
        self._update_matching_lines(lines, index)
171
        self.lines.extend(lines)
3735.40.6 by John Arbash Meinel
Move more information down into EquivalenceTable.
172
        endpoint = self.endpoint
173
        for line in lines:
174
            endpoint += len(line)
175
            self.line_offsets.append(endpoint)
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
176
        if len(self.line_offsets) != len(self.lines):
177
            raise AssertionError('Somehow the line offset indicator'
178
                ' got out of sync with the line counter.')
3735.40.6 by John Arbash Meinel
Move more information down into EquivalenceTable.
179
        self.endpoint = endpoint
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
180
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
181
    def _flush_insert(self, start_linenum, end_linenum,
182
                      new_lines, out_lines, index_lines):
183
        """Add an 'insert' request to the data stream."""
184
        bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum])
185
        insert_length = len(bytes_to_insert)
186
        # Each insert instruction is at most 127 bytes long
187
        for start_byte in xrange(0, insert_length, 127):
188
            insert_count = min(insert_length - start_byte, 127)
189
            out_lines.append(chr(insert_count))
190
            # Don't index the 'insert' instruction
191
            index_lines.append(False)
192
            insert = bytes_to_insert[start_byte:start_byte+insert_count]
193
            as_lines = osutils.split_lines(insert)
194
            out_lines.extend(as_lines)
195
            index_lines.extend([True]*len(as_lines))
196
197
    def _flush_copy(self, old_start_linenum, num_lines,
198
                    out_lines, index_lines):
199
        if old_start_linenum == 0:
200
            first_byte = 0
201
        else:
202
            first_byte = self.line_offsets[old_start_linenum - 1]
203
        stop_byte = self.line_offsets[old_start_linenum + num_lines - 1]
204
        num_bytes = stop_byte - first_byte
205
        # The data stream allows >64kB in a copy, but to match the compiled
206
        # code, we will also limit it to a 64kB copy
207
        for start_byte in xrange(first_byte, stop_byte, 64*1024):
208
            num_bytes = min(64*1024, stop_byte - first_byte)
209
            copy_bytes = encode_copy_instruction(start_byte, num_bytes)
210
            out_lines.append(copy_bytes)
211
            index_lines.append(False)
212
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
213
    def make_delta(self, new_lines, bytes_length=None, soft=False):
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
214
        """Compute the delta for this content versus the original content."""
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
215
        if bytes_length is None:
216
            bytes_length = sum(map(len, new_lines))
217
        # reserved for content type, content length
218
        out_lines = ['', '', encode_base128_int(bytes_length)]
3735.40.10 by John Arbash Meinel
Merge in the new delta format code.
219
        index_lines = [False, False, False]
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
220
        blocks = self.get_matching_blocks(new_lines, soft=soft)
221
        current_line_num = 0
222
        # We either copy a range (while there are reusable lines) or we
223
        # insert new lines. To find reusable lines we traverse
224
        for old_start, new_start, range_len in blocks:
225
            if new_start != current_line_num:
226
                # non-matching region, insert the content
227
                self._flush_insert(current_line_num, new_start,
228
                                   new_lines, out_lines, index_lines)
229
            current_line_num = new_start + range_len
230
            if range_len:
231
                self._flush_copy(old_start, range_len, out_lines, index_lines)
232
        return out_lines, index_lines
233
234
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
235
def encode_base128_int(val):
236
    """Convert an integer into a 7-bit lsb encoding."""
237
    bytes = []
238
    count = 0
239
    while val >= 0x80:
240
        bytes.append(chr((val | 0x80) & 0xFF))
241
        val >>= 7
242
    bytes.append(chr(val))
243
    return ''.join(bytes)
244
245
246
def decode_base128_int(bytes):
247
    """Decode an integer from a 7-bit lsb encoding."""
248
    offset = 0
249
    val = 0
250
    shift = 0
251
    bval = ord(bytes[offset])
252
    while bval >= 0x80:
253
        val |= (bval & 0x7F) << shift
254
        shift += 7
255
        offset += 1
256
        bval = ord(bytes[offset])
257
    val |= bval << shift
258
    offset += 1
259
    return val, offset
260
261
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
262
def encode_copy_instruction(offset, length):
263
    """Convert this offset into a control code and bytes."""
264
    copy_command = 0x80
265
    copy_bytes = [None]
266
267
    for copy_bit in (0x01, 0x02, 0x04, 0x08):
268
        base_byte = offset & 0xff
269
        if base_byte:
270
            copy_command |= copy_bit
271
            copy_bytes.append(chr(base_byte))
272
        offset >>= 8
273
    if length is None:
274
        # None is used by the test suite
275
        copy_bytes[0] = chr(copy_command)
276
        return ''.join(copy_bytes)
277
    if length > 0x10000:
278
        raise ValueError("we don't emit copy records for lengths > 64KiB")
279
    if length == 0:
280
        raise ValueError("We cannot emit a copy of length 0")
281
    if length != 0x10000:
282
        # A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
283
        # since that saves bytes for large chained copies
284
        for copy_bit in (0x10, 0x20):
285
            base_byte = length & 0xff
286
            if base_byte:
287
                copy_command |= copy_bit
288
                copy_bytes.append(chr(base_byte))
289
            length >>= 8
290
    copy_bytes[0] = chr(copy_command)
291
    return ''.join(copy_bytes)
292
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
293
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
294
def decode_copy_instruction(bytes, cmd, pos):
295
    """Decode a copy instruction from the next few bytes.
296
297
    A copy instruction is a variable number of bytes, so we will parse the
298
    bytes we care about, and return the new position, as well as the offset and
299
    length referred to in the bytes.
300
301
    :param bytes: A string of bytes
302
    :param cmd: The command code
303
    :param pos: The position in bytes right after the copy command
304
    :return: (offset, length, newpos)
305
        The offset of the copy start, the number of bytes to copy, and the
306
        position after the last byte of the copy
307
    """
308
    if cmd & 0x80 != 0x80:
309
        raise ValueError('copy instructions must have bit 0x80 set')
310
    offset = 0
311
    length = 0
312
    if (cmd & 0x01):
313
        offset = ord(bytes[pos])
314
        pos += 1
315
    if (cmd & 0x02):
316
        offset = offset | (ord(bytes[pos]) << 8)
317
        pos += 1
318
    if (cmd & 0x04):
319
        offset = offset | (ord(bytes[pos]) << 16)
320
        pos += 1
321
    if (cmd & 0x08):
322
        offset = offset | (ord(bytes[pos]) << 24)
323
        pos += 1
324
    if (cmd & 0x10):
325
        length = ord(bytes[pos])
326
        pos += 1
327
    if (cmd & 0x20):
328
        length = length | (ord(bytes[pos]) << 8)
329
        pos += 1
330
    if (cmd & 0x40):
331
        length = length | (ord(bytes[pos]) << 16)
332
        pos += 1
333
    if length == 0:
334
        length = 65536
335
    return (offset, length, pos)
336
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
337
338
def make_delta(source_bytes, target_bytes):
339
    """Create a delta from source to target."""
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
340
    # TODO: The checks below may not be a the right place yet.
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
341
    if type(source_bytes) is not str:
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
342
        raise TypeError('source is not a str')
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
343
    if type(target_bytes) is not str:
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
344
        raise TypeError('target is not a str')
3735.40.13 by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex.
345
    line_locations = LinesDeltaIndex(osutils.split_lines(source_bytes))
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
346
    delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes),
347
                                         bytes_length=len(target_bytes))
348
    return ''.join(delta)
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
349
350
351
def apply_delta(basis, delta):
352
    """Apply delta to this object to become new_version_id."""
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
353
    if type(basis) is not str:
354
        raise TypeError('basis is not a str')
355
    if type(delta) is not str:
356
        raise TypeError('delta is not a str')
357
    target_length, pos = decode_base128_int(delta)
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
358
    lines = []
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
359
    len_delta = len(delta)
360
    while pos < len_delta:
361
        cmd = ord(delta[pos])
362
        pos += 1
363
        if cmd & 0x80:
364
            offset, length, pos = decode_copy_instruction(delta, cmd, pos)
3735.40.19 by John Arbash Meinel
Implement apply_delta_to_source which doesn't have to malloc another string.
365
            last = offset + length
366
            if last > len(basis):
367
                raise ValueError('data would copy bytes past the'
368
                                 'end of source')
369
            lines.append(basis[offset:last])
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
370
        else: # Insert of 'cmd' bytes
371
            if cmd == 0:
372
                raise ValueError('Command == 0 not supported yet')
373
            lines.append(delta[pos:pos+cmd])
374
            pos += cmd
375
    bytes = ''.join(lines)
376
    if len(bytes) != target_length:
377
        raise ValueError('Delta claimed to be %d long, but ended up'
378
                         ' %d long' % (target_length, len(bytes)))
379
    return bytes
3735.40.19 by John Arbash Meinel
Implement apply_delta_to_source which doesn't have to malloc another string.
380
381
382
def apply_delta_to_source(source, delta_start, delta_end):
383
    """Extract a delta from source bytes, and apply it."""
384
    source_size = len(source)
385
    if delta_start >= source_size:
386
        raise ValueError('delta starts after source')
387
    if delta_end > source_size:
388
        raise ValueError('delta ends after source')
389
    if delta_start >= delta_end:
390
        raise ValueError('delta starts after it ends')
391
    delta_bytes = source[delta_start:delta_end]
392
    return apply_delta(source, delta_bytes)