1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Python version of compiled extensions for doing compression.
19
We separate the implementation from the groupcompress.py to avoid importing
23
from bzrlib import osutils
26
class LinesDeltaIndex(object):
27
"""This class indexes matches between strings.
29
:ivar lines: The 'static' lines that will be preserved between runs.
30
:ivar _matching_lines: A dict of {line:[matching offsets]}
31
:ivar line_offsets: The byte offset for the end of each line, used to
32
quickly map between a matching line number and the byte location
33
:ivar endpoint: The total number of bytes in self.line_offsets
36
def __init__(self, lines):
38
self.line_offsets = []
40
self._matching_lines = {}
41
self.extend_lines(lines, [True]*len(lines))
43
def _update_matching_lines(self, new_lines, index):
44
matches = self._matching_lines
45
start_idx = len(self.lines)
46
if len(new_lines) != len(index):
47
raise AssertionError('The number of lines to be indexed does'
48
' not match the index/don\'t index flags: %d != %d'
49
% (len(new_lines), len(index)))
50
for idx, do_index in enumerate(index):
53
matches.setdefault(new_lines[idx], []).append(start_idx + idx)
55
def get_matches(self, line):
56
"""Return the lines which match the line in right."""
58
return self._matching_lines[line]
62
def _get_longest_match(self, lines, pos, locations):
63
"""Look at all matches for the current line, return the longest.
65
:param lines: The lines we are matching against
66
:param pos: The current location we care about
67
:param locations: A list of lines that matched the current location.
68
This may be None, but often we'll have already found matches for
70
:return: (start_in_self, start_in_lines, num_lines)
71
All values are the offset in the list (aka the line number)
72
If start_in_self is None, then we have no matches, and this line
73
should be inserted in the target.
81
# TODO: is try/except better than get(..., None)?
83
locations = self._matching_lines[lines[pos]]
87
# No more matches, just return whatever we have, but we know
88
# that this last position is not going to match anything
94
# This is the first match in a range
95
copy_ends = [loc + 1 for loc in locations]
97
locations = None # Consumed
99
# We have a match started, compare to see if any of the
100
# current matches can be continued
101
next_locations = set(copy_ends).intersection(locations)
103
# At least one of the regions continues to match
104
copy_ends = [loc + 1 for loc in next_locations]
106
locations = None # Consumed
108
# All current regions no longer match.
109
# This line does still match something, just not at the
110
# end of the previous matches. We will return locations
111
# so that we can avoid another _matching_lines lookup.
114
if copy_ends is None:
115
# We have no matches, this is a pure insert
116
return None, pos, locations
117
return (((min(copy_ends) - range_len, range_start, range_len)),
120
def get_matching_blocks(self, lines, soft=False):
121
"""Return the ranges in lines which match self.lines.
123
:param lines: lines to compress
124
:return: A list of (old_start, new_start, length) tuples which reflect
125
a region in self.lines that is present in lines. The last element
126
of the list is always (old_len, new_len, 0) to provide a end point
127
for generating instructions from the matching blocks list.
129
# In this code, we iterate over multiple _get_longest_match calls, to
130
# find the next longest copy, and possible insert regions. We then
131
# convert that to the simple matching_blocks representation, since
132
# otherwise inserting 10 lines in a row would show up as 10
138
result_append = result.append
141
min_match_bytes = 200
143
block, pos, locations = self._get_longest_match(lines, pos,
145
if block is not None:
146
# Check to see if we match fewer than min_match_bytes. As we
147
# will turn this into a pure 'insert', rather than a copy.
148
# block[-1] is the number of lines. A quick check says if we
149
# have more lines than min_match_bytes, then we know we have
151
if block[-1] < min_match_bytes:
152
# This block may be a 'short' block, check
153
old_start, new_start, range_len = block
154
matched_bytes = sum(map(len,
155
lines[new_start:new_start + range_len]))
156
if matched_bytes < min_match_bytes:
158
if block is not None:
160
result_append((len(self.lines), len(lines), 0))
163
def extend_lines(self, lines, index):
164
"""Add more lines to the left-lines list.
166
:param lines: A list of lines to add
167
:param index: A True/False for each node to define if it should be
170
self._update_matching_lines(lines, index)
171
self.lines.extend(lines)
172
endpoint = self.endpoint
174
endpoint += len(line)
175
self.line_offsets.append(endpoint)
176
if len(self.line_offsets) != len(self.lines):
177
raise AssertionError('Somehow the line offset indicator'
178
' got out of sync with the line counter.')
179
self.endpoint = endpoint
181
def _flush_insert(self, start_linenum, end_linenum,
182
new_lines, out_lines, index_lines):
183
"""Add an 'insert' request to the data stream."""
184
bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum])
185
insert_length = len(bytes_to_insert)
186
# Each insert instruction is at most 127 bytes long
187
for start_byte in xrange(0, insert_length, 127):
188
insert_count = min(insert_length - start_byte, 127)
189
out_lines.append(chr(insert_count))
190
# Don't index the 'insert' instruction
191
index_lines.append(False)
192
insert = bytes_to_insert[start_byte:start_byte+insert_count]
193
as_lines = osutils.split_lines(insert)
194
out_lines.extend(as_lines)
195
index_lines.extend([True]*len(as_lines))
197
def _flush_copy(self, old_start_linenum, num_lines,
198
out_lines, index_lines):
199
if old_start_linenum == 0:
202
first_byte = self.line_offsets[old_start_linenum - 1]
203
stop_byte = self.line_offsets[old_start_linenum + num_lines - 1]
204
num_bytes = stop_byte - first_byte
205
# The data stream allows >64kB in a copy, but to match the compiled
206
# code, we will also limit it to a 64kB copy
207
for start_byte in xrange(first_byte, stop_byte, 64*1024):
208
num_bytes = min(64*1024, stop_byte - first_byte)
209
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
210
out_lines.append(copy_bytes)
211
index_lines.append(False)
213
def make_delta(self, new_lines, bytes_length=None, soft=False):
214
"""Compute the delta for this content versus the original content."""
215
if bytes_length is None:
216
bytes_length = sum(map(len, new_lines))
217
# reserved for content type, content length
218
out_lines = ['', '', encode_base128_int(bytes_length)]
219
index_lines = [False, False, False]
220
blocks = self.get_matching_blocks(new_lines, soft=soft)
222
# We either copy a range (while there are reusable lines) or we
223
# insert new lines. To find reusable lines we traverse
224
for old_start, new_start, range_len in blocks:
225
if new_start != current_line_num:
226
# non-matching region, insert the content
227
self._flush_insert(current_line_num, new_start,
228
new_lines, out_lines, index_lines)
229
current_line_num = new_start + range_len
231
self._flush_copy(old_start, range_len, out_lines, index_lines)
232
return out_lines, index_lines
235
def encode_base128_int(val):
236
"""Convert an integer into a 7-bit lsb encoding."""
240
bytes.append(chr((val | 0x80) & 0xFF))
242
bytes.append(chr(val))
243
return ''.join(bytes)
246
def decode_base128_int(bytes):
247
"""Decode an integer from a 7-bit lsb encoding."""
251
bval = ord(bytes[offset])
253
val |= (bval & 0x7F) << shift
256
bval = ord(bytes[offset])
262
def encode_copy_instruction(offset, length):
263
"""Convert this offset into a control code and bytes."""
267
for copy_bit in (0x01, 0x02, 0x04, 0x08):
268
base_byte = offset & 0xff
270
copy_command |= copy_bit
271
copy_bytes.append(chr(base_byte))
274
# None is used by the test suite
275
copy_bytes[0] = chr(copy_command)
276
return ''.join(copy_bytes)
278
raise ValueError("we don't emit copy records for lengths > 64KiB")
280
raise ValueError("We cannot emit a copy of length 0")
281
if length != 0x10000:
282
# A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
283
# since that saves bytes for large chained copies
284
for copy_bit in (0x10, 0x20):
285
base_byte = length & 0xff
287
copy_command |= copy_bit
288
copy_bytes.append(chr(base_byte))
290
copy_bytes[0] = chr(copy_command)
291
return ''.join(copy_bytes)
294
def decode_copy_instruction(bytes, cmd, pos):
295
"""Decode a copy instruction from the next few bytes.
297
A copy instruction is a variable number of bytes, so we will parse the
298
bytes we care about, and return the new position, as well as the offset and
299
length referred to in the bytes.
301
:param bytes: A string of bytes
302
:param cmd: The command code
303
:param pos: The position in bytes right after the copy command
304
:return: (offset, length, newpos)
305
The offset of the copy start, the number of bytes to copy, and the
306
position after the last byte of the copy
308
if cmd & 0x80 != 0x80:
309
raise ValueError('copy instructions must have bit 0x80 set')
313
offset = ord(bytes[pos])
316
offset = offset | (ord(bytes[pos]) << 8)
319
offset = offset | (ord(bytes[pos]) << 16)
322
offset = offset | (ord(bytes[pos]) << 24)
325
length = ord(bytes[pos])
328
length = length | (ord(bytes[pos]) << 8)
331
length = length | (ord(bytes[pos]) << 16)
335
return (offset, length, pos)
338
def make_delta(source_bytes, target_bytes):
339
"""Create a delta from source to target."""
340
# TODO: The checks below may not be a the right place yet.
341
if type(source_bytes) is not str:
342
raise TypeError('source is not a str')
343
if type(target_bytes) is not str:
344
raise TypeError('target is not a str')
345
line_locations = LinesDeltaIndex(osutils.split_lines(source_bytes))
346
delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes),
347
bytes_length=len(target_bytes))
348
return ''.join(delta)
351
def apply_delta(basis, delta):
352
"""Apply delta to this object to become new_version_id."""
353
if type(basis) is not str:
354
raise TypeError('basis is not a str')
355
if type(delta) is not str:
356
raise TypeError('delta is not a str')
357
target_length, pos = decode_base128_int(delta)
359
len_delta = len(delta)
360
while pos < len_delta:
361
cmd = ord(delta[pos])
364
offset, length, pos = decode_copy_instruction(delta, cmd, pos)
365
last = offset + length
366
if last > len(basis):
367
raise ValueError('data would copy bytes past the'
369
lines.append(basis[offset:last])
370
else: # Insert of 'cmd' bytes
372
raise ValueError('Command == 0 not supported yet')
373
lines.append(delta[pos:pos+cmd])
375
bytes = ''.join(lines)
376
if len(bytes) != target_length:
377
raise ValueError('Delta claimed to be %d long, but ended up'
378
' %d long' % (target_length, len(bytes)))
382
def apply_delta_to_source(source, delta_start, delta_end):
383
"""Extract a delta from source bytes, and apply it."""
384
source_size = len(source)
385
if delta_start >= source_size:
386
raise ValueError('delta starts after source')
387
if delta_end > source_size:
388
raise ValueError('delta ends after source')
389
if delta_start >= delta_end:
390
raise ValueError('delta starts after it ends')
391
delta_bytes = source[delta_start:delta_end]
392
return apply_delta(source, delta_bytes)