~bzr-pqm/bzr/bzr.dev

4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
1
# Copyright (C) 2009 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17
"""Python version of compiled extensions for doing compression.
18
19
We separate the implementation from the groupcompress.py to avoid importing
20
useless stuff.
21
"""
22
6379.6.7 by Jelmer Vernooij
Move importing from future until after doc string, otherwise the doc string will disappear.
23
from __future__ import absolute_import
24
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
25
from bzrlib import osutils
26
27
4300.1.2 by John Arbash Meinel
Change the pure-python compressor a bit.
28
class _OutputHandler(object):
29
    """A simple class which just tracks how to split up an insert request."""
30
31
    def __init__(self, out_lines, index_lines, min_len_to_index):
32
        self.out_lines = out_lines
33
        self.index_lines = index_lines
34
        self.min_len_to_index = min_len_to_index
35
        self.cur_insert_lines = []
36
        self.cur_insert_len = 0
37
38
    def add_copy(self, start_byte, end_byte):
39
        # The data stream allows >64kB in a copy, but to match the compiled
40
        # code, we will also limit it to a 64kB copy
41
        for start_byte in xrange(start_byte, end_byte, 64*1024):
42
            num_bytes = min(64*1024, end_byte - start_byte)
43
            copy_bytes = encode_copy_instruction(start_byte, num_bytes)
44
            self.out_lines.append(copy_bytes)
45
            self.index_lines.append(False)
46
47
    def _flush_insert(self):
48
        if not self.cur_insert_lines:
49
            return
4300.1.7 by John Arbash Meinel
Bring in the other test cases.
50
        if self.cur_insert_len > 127:
51
            raise AssertionError('We cannot insert more than 127 bytes'
52
                                 ' at a time.')
4300.1.2 by John Arbash Meinel
Change the pure-python compressor a bit.
53
        self.out_lines.append(chr(self.cur_insert_len))
54
        self.index_lines.append(False)
55
        self.out_lines.extend(self.cur_insert_lines)
56
        if self.cur_insert_len < self.min_len_to_index:
57
            self.index_lines.extend([False]*len(self.cur_insert_lines))
58
        else:
59
            self.index_lines.extend([True]*len(self.cur_insert_lines))
60
        self.cur_insert_lines = []
61
        self.cur_insert_len = 0
62
63
    def _insert_long_line(self, line):
64
        # Flush out anything pending
65
        self._flush_insert()
66
        line_len = len(line)
67
        for start_index in xrange(0, line_len, 127):
68
            next_len = min(127, line_len - start_index)
69
            self.out_lines.append(chr(next_len))
70
            self.index_lines.append(False)
71
            self.out_lines.append(line[start_index:start_index+next_len])
72
            # We don't index long lines, because we won't be able to match
73
            # a line split across multiple inserts anway
74
            self.index_lines.append(False)
75
76
    def add_insert(self, lines):
4300.1.7 by John Arbash Meinel
Bring in the other test cases.
77
        if self.cur_insert_lines != []:
78
            raise AssertionError('self.cur_insert_lines must be empty when'
79
                                 ' adding a new insert')
4300.1.2 by John Arbash Meinel
Change the pure-python compressor a bit.
80
        for line in lines:
81
            if len(line) > 127:
82
                self._insert_long_line(line)
83
            else:
84
                next_len = len(line) + self.cur_insert_len
85
                if next_len > 127:
86
                    # Adding this line would overflow, so flush, and start over
87
                    self._flush_insert()
88
                    self.cur_insert_lines = [line]
89
                    self.cur_insert_len = len(line)
90
                else:
91
                    self.cur_insert_lines.append(line)
92
                    self.cur_insert_len = next_len
93
        self._flush_insert()
94
95
3735.40.13 by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex.
96
class LinesDeltaIndex(object):
97
    """This class indexes matches between strings.
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
98
99
    :ivar lines: The 'static' lines that will be preserved between runs.
3735.40.13 by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex.
100
    :ivar _matching_lines: A dict of {line:[matching offsets]}
101
    :ivar line_offsets: The byte offset for the end of each line, used to
102
        quickly map between a matching line number and the byte location
103
    :ivar endpoint: The total number of bytes in self.line_offsets
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
104
    """
105
4300.1.2 by John Arbash Meinel
Change the pure-python compressor a bit.
106
    _MIN_MATCH_BYTES = 10
107
    _SOFT_MIN_MATCH_BYTES = 200
108
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
109
    def __init__(self, lines):
3735.40.13 by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex.
110
        self.lines = []
3735.40.6 by John Arbash Meinel
Move more information down into EquivalenceTable.
111
        self.line_offsets = []
3735.40.13 by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex.
112
        self.endpoint = 0
113
        self._matching_lines = {}
114
        self.extend_lines(lines, [True]*len(lines))
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
115
116
    def _update_matching_lines(self, new_lines, index):
117
        matches = self._matching_lines
118
        start_idx = len(self.lines)
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
119
        if len(new_lines) != len(index):
120
            raise AssertionError('The number of lines to be indexed does'
121
                ' not match the index/don\'t index flags: %d != %d'
122
                % (len(new_lines), len(index)))
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
123
        for idx, do_index in enumerate(index):
124
            if not do_index:
125
                continue
4300.1.4 by John Arbash Meinel
Change self._matching_lines to use a set rather than a list.
126
            line = new_lines[idx]
127
            try:
128
                matches[line].add(start_idx + idx)
129
            except KeyError:
130
                matches[line] = set([start_idx + idx])
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
131
132
    def get_matches(self, line):
133
        """Return the lines which match the line in right."""
134
        try:
135
            return self._matching_lines[line]
136
        except KeyError:
137
            return None
138
4300.1.5 by John Arbash Meinel
A couple more cleanups on the pure-python implementation.
139
    def _get_longest_match(self, lines, pos):
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
140
        """Look at all matches for the current line, return the longest.
141
142
        :param lines: The lines we are matching against
143
        :param pos: The current location we care about
144
        :param locations: A list of lines that matched the current location.
145
            This may be None, but often we'll have already found matches for
146
            this line.
147
        :return: (start_in_self, start_in_lines, num_lines)
148
            All values are the offset in the list (aka the line number)
149
            If start_in_self is None, then we have no matches, and this line
150
            should be inserted in the target.
151
        """
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
152
        range_start = pos
153
        range_len = 0
4300.1.5 by John Arbash Meinel
A couple more cleanups on the pure-python implementation.
154
        prev_locations = None
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
155
        max_pos = len(lines)
4300.1.4 by John Arbash Meinel
Change self._matching_lines to use a set rather than a list.
156
        matching = self._matching_lines
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
157
        while pos < max_pos:
4300.1.5 by John Arbash Meinel
A couple more cleanups on the pure-python implementation.
158
            try:
159
                locations = matching[lines[pos]]
160
            except KeyError:
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
161
                # No more matches, just return whatever we have, but we know
162
                # that this last position is not going to match anything
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
163
                pos += 1
164
                break
4300.1.5 by John Arbash Meinel
A couple more cleanups on the pure-python implementation.
165
            # We have a match
166
            if prev_locations is None:
167
                # This is the first match in a range
168
                prev_locations = locations
169
                range_len = 1
170
                locations = None # Consumed
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
171
            else:
4300.1.5 by John Arbash Meinel
A couple more cleanups on the pure-python implementation.
172
                # We have a match started, compare to see if any of the
173
                # current matches can be continued
174
                next_locations = locations.intersection([loc + 1 for loc
175
                                                         in prev_locations])
176
                if next_locations:
177
                    # At least one of the regions continues to match
178
                    prev_locations = set(next_locations)
179
                    range_len += 1
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
180
                    locations = None # Consumed
181
                else:
4300.1.5 by John Arbash Meinel
A couple more cleanups on the pure-python implementation.
182
                    # All current regions no longer match.
183
                    # This line does still match something, just not at the
184
                    # end of the previous matches. We will return locations
185
                    # so that we can avoid another _matching_lines lookup.
186
                    break
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
187
            pos += 1
4300.1.5 by John Arbash Meinel
A couple more cleanups on the pure-python implementation.
188
        if prev_locations is None:
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
189
            # We have no matches, this is a pure insert
4300.1.5 by John Arbash Meinel
A couple more cleanups on the pure-python implementation.
190
            return None, pos
191
        smallest = min(prev_locations)
192
        return (smallest - range_len + 1, range_start, range_len), pos
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
193
194
    def get_matching_blocks(self, lines, soft=False):
195
        """Return the ranges in lines which match self.lines.
196
197
        :param lines: lines to compress
198
        :return: A list of (old_start, new_start, length) tuples which reflect
199
            a region in self.lines that is present in lines.  The last element
200
            of the list is always (old_len, new_len, 0) to provide a end point
201
            for generating instructions from the matching blocks list.
202
        """
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
203
        # In this code, we iterate over multiple _get_longest_match calls, to
204
        # find the next longest copy, and possible insert regions. We then
205
        # convert that to the simple matching_blocks representation, since
206
        # otherwise inserting 10 lines in a row would show up as 10
207
        # instructions.
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
208
        result = []
209
        pos = 0
210
        max_pos = len(lines)
211
        result_append = result.append
4300.1.2 by John Arbash Meinel
Change the pure-python compressor a bit.
212
        min_match_bytes = self._MIN_MATCH_BYTES
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
213
        if soft:
4300.1.2 by John Arbash Meinel
Change the pure-python compressor a bit.
214
            min_match_bytes = self._SOFT_MIN_MATCH_BYTES
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
215
        while pos < max_pos:
4300.1.5 by John Arbash Meinel
A couple more cleanups on the pure-python implementation.
216
            block, pos = self._get_longest_match(lines, pos)
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
217
            if block is not None:
3735.40.14 by John Arbash Meinel
Get rid of the self._right_lines state. It doesn't matter anymore.
218
                # Check to see if we match fewer than min_match_bytes. As we
219
                # will turn this into a pure 'insert', rather than a copy.
220
                # block[-1] is the number of lines. A quick check says if we
221
                # have more lines than min_match_bytes, then we know we have
222
                # enough bytes.
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
223
                if block[-1] < min_match_bytes:
224
                    # This block may be a 'short' block, check
225
                    old_start, new_start, range_len = block
226
                    matched_bytes = sum(map(len,
227
                        lines[new_start:new_start + range_len]))
228
                    if matched_bytes < min_match_bytes:
229
                        block = None
230
            if block is not None:
231
                result_append(block)
232
        result_append((len(self.lines), len(lines), 0))
233
        return result
234
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
235
    def extend_lines(self, lines, index):
236
        """Add more lines to the left-lines list.
237
238
        :param lines: A list of lines to add
239
        :param index: A True/False for each node to define if it should be
240
            indexed.
241
        """
242
        self._update_matching_lines(lines, index)
243
        self.lines.extend(lines)
3735.40.6 by John Arbash Meinel
Move more information down into EquivalenceTable.
244
        endpoint = self.endpoint
245
        for line in lines:
246
            endpoint += len(line)
247
            self.line_offsets.append(endpoint)
3735.40.15 by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code.
248
        if len(self.line_offsets) != len(self.lines):
249
            raise AssertionError('Somehow the line offset indicator'
250
                ' got out of sync with the line counter.')
3735.40.6 by John Arbash Meinel
Move more information down into EquivalenceTable.
251
        self.endpoint = endpoint
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
252
253
    def _flush_insert(self, start_linenum, end_linenum,
254
                      new_lines, out_lines, index_lines):
255
        """Add an 'insert' request to the data stream."""
256
        bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum])
257
        insert_length = len(bytes_to_insert)
258
        # Each insert instruction is at most 127 bytes long
259
        for start_byte in xrange(0, insert_length, 127):
260
            insert_count = min(insert_length - start_byte, 127)
261
            out_lines.append(chr(insert_count))
262
            # Don't index the 'insert' instruction
263
            index_lines.append(False)
264
            insert = bytes_to_insert[start_byte:start_byte+insert_count]
265
            as_lines = osutils.split_lines(insert)
266
            out_lines.extend(as_lines)
267
            index_lines.extend([True]*len(as_lines))
268
269
    def _flush_copy(self, old_start_linenum, num_lines,
270
                    out_lines, index_lines):
271
        if old_start_linenum == 0:
272
            first_byte = 0
273
        else:
274
            first_byte = self.line_offsets[old_start_linenum - 1]
275
        stop_byte = self.line_offsets[old_start_linenum + num_lines - 1]
276
        num_bytes = stop_byte - first_byte
277
        # The data stream allows >64kB in a copy, but to match the compiled
278
        # code, we will also limit it to a 64kB copy
279
        for start_byte in xrange(first_byte, stop_byte, 64*1024):
4241.20.1 by John Arbash Meinel
Cherrypick the python groupcompress fix for bzr-1.14-final
280
            num_bytes = min(64*1024, stop_byte - start_byte)
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
281
            copy_bytes = encode_copy_instruction(start_byte, num_bytes)
282
            out_lines.append(copy_bytes)
283
            index_lines.append(False)
284
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
285
    def make_delta(self, new_lines, bytes_length=None, soft=False):
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
286
        """Compute the delta for this content versus the original content."""
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
287
        if bytes_length is None:
288
            bytes_length = sum(map(len, new_lines))
289
        # reserved for content type, content length
290
        out_lines = ['', '', encode_base128_int(bytes_length)]
3735.40.10 by John Arbash Meinel
Merge in the new delta format code.
291
        index_lines = [False, False, False]
4300.1.2 by John Arbash Meinel
Change the pure-python compressor a bit.
292
        output_handler = _OutputHandler(out_lines, index_lines,
293
                                        self._MIN_MATCH_BYTES)
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
294
        blocks = self.get_matching_blocks(new_lines, soft=soft)
295
        current_line_num = 0
296
        # We either copy a range (while there are reusable lines) or we
297
        # insert new lines. To find reusable lines we traverse
298
        for old_start, new_start, range_len in blocks:
299
            if new_start != current_line_num:
300
                # non-matching region, insert the content
4300.1.2 by John Arbash Meinel
Change the pure-python compressor a bit.
301
                output_handler.add_insert(new_lines[current_line_num:new_start])
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
302
            current_line_num = new_start + range_len
303
            if range_len:
4300.1.2 by John Arbash Meinel
Change the pure-python compressor a bit.
304
                # Convert the line based offsets into byte based offsets
305
                if old_start == 0:
306
                    first_byte = 0
307
                else:
308
                    first_byte = self.line_offsets[old_start - 1]
309
                last_byte = self.line_offsets[old_start + range_len - 1]
310
                output_handler.add_copy(first_byte, last_byte)
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
311
        return out_lines, index_lines
312
313
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
314
def encode_base128_int(val):
315
    """Convert an integer into a 7-bit lsb encoding."""
316
    bytes = []
317
    count = 0
318
    while val >= 0x80:
319
        bytes.append(chr((val | 0x80) & 0xFF))
320
        val >>= 7
321
    bytes.append(chr(val))
322
    return ''.join(bytes)
323
324
325
def decode_base128_int(bytes):
326
    """Decode an integer from a 7-bit lsb encoding."""
327
    offset = 0
328
    val = 0
329
    shift = 0
330
    bval = ord(bytes[offset])
331
    while bval >= 0x80:
332
        val |= (bval & 0x7F) << shift
333
        shift += 7
334
        offset += 1
335
        bval = ord(bytes[offset])
336
    val |= bval << shift
337
    offset += 1
338
    return val, offset
339
340
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
341
def encode_copy_instruction(offset, length):
342
    """Convert this offset into a control code and bytes."""
343
    copy_command = 0x80
344
    copy_bytes = [None]
345
346
    for copy_bit in (0x01, 0x02, 0x04, 0x08):
347
        base_byte = offset & 0xff
348
        if base_byte:
349
            copy_command |= copy_bit
350
            copy_bytes.append(chr(base_byte))
351
        offset >>= 8
352
    if length is None:
4300.2.1 by John Arbash Meinel
Fix bug #364900, properly remove the 64kB that was just encoded in the copy.
353
        raise ValueError("cannot supply a length of None")
3735.40.7 by John Arbash Meinel
Move even more functionality into EquivalenceTable.
354
    if length > 0x10000:
355
        raise ValueError("we don't emit copy records for lengths > 64KiB")
356
    if length == 0:
357
        raise ValueError("We cannot emit a copy of length 0")
358
    if length != 0x10000:
359
        # A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
360
        # since that saves bytes for large chained copies
361
        for copy_bit in (0x10, 0x20):
362
            base_byte = length & 0xff
363
            if base_byte:
364
                copy_command |= copy_bit
365
                copy_bytes.append(chr(base_byte))
366
            length >>= 8
367
    copy_bytes[0] = chr(copy_command)
368
    return ''.join(copy_bytes)
369
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
370
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
371
def decode_copy_instruction(bytes, cmd, pos):
372
    """Decode a copy instruction from the next few bytes.
373
374
    A copy instruction is a variable number of bytes, so we will parse the
375
    bytes we care about, and return the new position, as well as the offset and
376
    length referred to in the bytes.
377
378
    :param bytes: A string of bytes
379
    :param cmd: The command code
380
    :param pos: The position in bytes right after the copy command
381
    :return: (offset, length, newpos)
382
        The offset of the copy start, the number of bytes to copy, and the
383
        position after the last byte of the copy
384
    """
385
    if cmd & 0x80 != 0x80:
386
        raise ValueError('copy instructions must have bit 0x80 set')
387
    offset = 0
388
    length = 0
389
    if (cmd & 0x01):
390
        offset = ord(bytes[pos])
391
        pos += 1
392
    if (cmd & 0x02):
393
        offset = offset | (ord(bytes[pos]) << 8)
394
        pos += 1
395
    if (cmd & 0x04):
396
        offset = offset | (ord(bytes[pos]) << 16)
397
        pos += 1
398
    if (cmd & 0x08):
399
        offset = offset | (ord(bytes[pos]) << 24)
400
        pos += 1
401
    if (cmd & 0x10):
402
        length = ord(bytes[pos])
403
        pos += 1
404
    if (cmd & 0x20):
405
        length = length | (ord(bytes[pos]) << 8)
406
        pos += 1
407
    if (cmd & 0x40):
408
        length = length | (ord(bytes[pos]) << 16)
409
        pos += 1
410
    if length == 0:
411
        length = 65536
412
    return (offset, length, pos)
413
3735.40.5 by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx
414
415
def make_delta(source_bytes, target_bytes):
416
    """Create a delta from source to target."""
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
417
    if type(source_bytes) is not str:
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
418
        raise TypeError('source is not a str')
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
419
    if type(target_bytes) is not str:
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
420
        raise TypeError('target is not a str')
3735.40.13 by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex.
421
    line_locations = LinesDeltaIndex(osutils.split_lines(source_bytes))
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
422
    delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes),
423
                                         bytes_length=len(target_bytes))
424
    return ''.join(delta)
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
425
426
427
def apply_delta(basis, delta):
428
    """Apply delta to this object to become new_version_id."""
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
429
    if type(basis) is not str:
430
        raise TypeError('basis is not a str')
431
    if type(delta) is not str:
432
        raise TypeError('delta is not a str')
433
    target_length, pos = decode_base128_int(delta)
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
434
    lines = []
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
435
    len_delta = len(delta)
436
    while pos < len_delta:
437
        cmd = ord(delta[pos])
438
        pos += 1
439
        if cmd & 0x80:
440
            offset, length, pos = decode_copy_instruction(delta, cmd, pos)
3735.40.19 by John Arbash Meinel
Implement apply_delta_to_source which doesn't have to malloc another string.
441
            last = offset + length
442
            if last > len(basis):
443
                raise ValueError('data would copy bytes past the'
444
                                 'end of source')
445
            lines.append(basis[offset:last])
3735.40.11 by John Arbash Meinel
Implement make_delta and apply_delta.
446
        else: # Insert of 'cmd' bytes
447
            if cmd == 0:
448
                raise ValueError('Command == 0 not supported yet')
449
            lines.append(delta[pos:pos+cmd])
450
            pos += cmd
451
    bytes = ''.join(lines)
452
    if len(bytes) != target_length:
453
        raise ValueError('Delta claimed to be %d long, but ended up'
454
                         ' %d long' % (target_length, len(bytes)))
455
    return bytes
3735.40.19 by John Arbash Meinel
Implement apply_delta_to_source which doesn't have to malloc another string.
456
457
458
def apply_delta_to_source(source, delta_start, delta_end):
459
    """Extract a delta from source bytes, and apply it."""
460
    source_size = len(source)
461
    if delta_start >= source_size:
462
        raise ValueError('delta starts after source')
463
    if delta_end > source_size:
464
        raise ValueError('delta ends after source')
465
    if delta_start >= delta_end:
466
        raise ValueError('delta starts after it ends')
467
    delta_bytes = source[delta_start:delta_end]
468
    return apply_delta(source, delta_bytes)