1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Python version of compiled extensions for doing compression.
19
We separate the implementation from the groupcompress.py to avoid importing
23
from __future__ import absolute_import
25
from bzrlib import osutils
28
class _OutputHandler(object):
29
"""A simple class which just tracks how to split up an insert request."""
31
def __init__(self, out_lines, index_lines, min_len_to_index):
32
self.out_lines = out_lines
33
self.index_lines = index_lines
34
self.min_len_to_index = min_len_to_index
35
self.cur_insert_lines = []
36
self.cur_insert_len = 0
38
def add_copy(self, start_byte, end_byte):
39
# The data stream allows >64kB in a copy, but to match the compiled
40
# code, we will also limit it to a 64kB copy
41
for start_byte in xrange(start_byte, end_byte, 64*1024):
42
num_bytes = min(64*1024, end_byte - start_byte)
43
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
44
self.out_lines.append(copy_bytes)
45
self.index_lines.append(False)
47
def _flush_insert(self):
48
if not self.cur_insert_lines:
50
if self.cur_insert_len > 127:
51
raise AssertionError('We cannot insert more than 127 bytes'
53
self.out_lines.append(chr(self.cur_insert_len))
54
self.index_lines.append(False)
55
self.out_lines.extend(self.cur_insert_lines)
56
if self.cur_insert_len < self.min_len_to_index:
57
self.index_lines.extend([False]*len(self.cur_insert_lines))
59
self.index_lines.extend([True]*len(self.cur_insert_lines))
60
self.cur_insert_lines = []
61
self.cur_insert_len = 0
63
def _insert_long_line(self, line):
64
# Flush out anything pending
67
for start_index in xrange(0, line_len, 127):
68
next_len = min(127, line_len - start_index)
69
self.out_lines.append(chr(next_len))
70
self.index_lines.append(False)
71
self.out_lines.append(line[start_index:start_index+next_len])
72
# We don't index long lines, because we won't be able to match
73
# a line split across multiple inserts anway
74
self.index_lines.append(False)
76
def add_insert(self, lines):
77
if self.cur_insert_lines != []:
78
raise AssertionError('self.cur_insert_lines must be empty when'
79
' adding a new insert')
82
self._insert_long_line(line)
84
next_len = len(line) + self.cur_insert_len
86
# Adding this line would overflow, so flush, and start over
88
self.cur_insert_lines = [line]
89
self.cur_insert_len = len(line)
91
self.cur_insert_lines.append(line)
92
self.cur_insert_len = next_len
96
class LinesDeltaIndex(object):
97
"""This class indexes matches between strings.
99
:ivar lines: The 'static' lines that will be preserved between runs.
100
:ivar _matching_lines: A dict of {line:[matching offsets]}
101
:ivar line_offsets: The byte offset for the end of each line, used to
102
quickly map between a matching line number and the byte location
103
:ivar endpoint: The total number of bytes in self.line_offsets
106
_MIN_MATCH_BYTES = 10
107
_SOFT_MIN_MATCH_BYTES = 200
109
def __init__(self, lines):
111
self.line_offsets = []
113
self._matching_lines = {}
114
self.extend_lines(lines, [True]*len(lines))
116
def _update_matching_lines(self, new_lines, index):
117
matches = self._matching_lines
118
start_idx = len(self.lines)
119
if len(new_lines) != len(index):
120
raise AssertionError('The number of lines to be indexed does'
121
' not match the index/don\'t index flags: %d != %d'
122
% (len(new_lines), len(index)))
123
for idx, do_index in enumerate(index):
126
line = new_lines[idx]
128
matches[line].add(start_idx + idx)
130
matches[line] = set([start_idx + idx])
132
def get_matches(self, line):
133
"""Return the lines which match the line in right."""
135
return self._matching_lines[line]
139
def _get_longest_match(self, lines, pos):
140
"""Look at all matches for the current line, return the longest.
142
:param lines: The lines we are matching against
143
:param pos: The current location we care about
144
:param locations: A list of lines that matched the current location.
145
This may be None, but often we'll have already found matches for
147
:return: (start_in_self, start_in_lines, num_lines)
148
All values are the offset in the list (aka the line number)
149
If start_in_self is None, then we have no matches, and this line
150
should be inserted in the target.
154
prev_locations = None
156
matching = self._matching_lines
159
locations = matching[lines[pos]]
161
# No more matches, just return whatever we have, but we know
162
# that this last position is not going to match anything
166
if prev_locations is None:
167
# This is the first match in a range
168
prev_locations = locations
170
locations = None # Consumed
172
# We have a match started, compare to see if any of the
173
# current matches can be continued
174
next_locations = locations.intersection([loc + 1 for loc
177
# At least one of the regions continues to match
178
prev_locations = set(next_locations)
180
locations = None # Consumed
182
# All current regions no longer match.
183
# This line does still match something, just not at the
184
# end of the previous matches. We will return locations
185
# so that we can avoid another _matching_lines lookup.
188
if prev_locations is None:
189
# We have no matches, this is a pure insert
191
smallest = min(prev_locations)
192
return (smallest - range_len + 1, range_start, range_len), pos
194
def get_matching_blocks(self, lines, soft=False):
195
"""Return the ranges in lines which match self.lines.
197
:param lines: lines to compress
198
:return: A list of (old_start, new_start, length) tuples which reflect
199
a region in self.lines that is present in lines. The last element
200
of the list is always (old_len, new_len, 0) to provide a end point
201
for generating instructions from the matching blocks list.
203
# In this code, we iterate over multiple _get_longest_match calls, to
204
# find the next longest copy, and possible insert regions. We then
205
# convert that to the simple matching_blocks representation, since
206
# otherwise inserting 10 lines in a row would show up as 10
211
result_append = result.append
212
min_match_bytes = self._MIN_MATCH_BYTES
214
min_match_bytes = self._SOFT_MIN_MATCH_BYTES
216
block, pos = self._get_longest_match(lines, pos)
217
if block is not None:
218
# Check to see if we match fewer than min_match_bytes. As we
219
# will turn this into a pure 'insert', rather than a copy.
220
# block[-1] is the number of lines. A quick check says if we
221
# have more lines than min_match_bytes, then we know we have
223
if block[-1] < min_match_bytes:
224
# This block may be a 'short' block, check
225
old_start, new_start, range_len = block
226
matched_bytes = sum(map(len,
227
lines[new_start:new_start + range_len]))
228
if matched_bytes < min_match_bytes:
230
if block is not None:
232
result_append((len(self.lines), len(lines), 0))
235
def extend_lines(self, lines, index):
236
"""Add more lines to the left-lines list.
238
:param lines: A list of lines to add
239
:param index: A True/False for each node to define if it should be
242
self._update_matching_lines(lines, index)
243
self.lines.extend(lines)
244
endpoint = self.endpoint
246
endpoint += len(line)
247
self.line_offsets.append(endpoint)
248
if len(self.line_offsets) != len(self.lines):
249
raise AssertionError('Somehow the line offset indicator'
250
' got out of sync with the line counter.')
251
self.endpoint = endpoint
253
def _flush_insert(self, start_linenum, end_linenum,
254
new_lines, out_lines, index_lines):
255
"""Add an 'insert' request to the data stream."""
256
bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum])
257
insert_length = len(bytes_to_insert)
258
# Each insert instruction is at most 127 bytes long
259
for start_byte in xrange(0, insert_length, 127):
260
insert_count = min(insert_length - start_byte, 127)
261
out_lines.append(chr(insert_count))
262
# Don't index the 'insert' instruction
263
index_lines.append(False)
264
insert = bytes_to_insert[start_byte:start_byte+insert_count]
265
as_lines = osutils.split_lines(insert)
266
out_lines.extend(as_lines)
267
index_lines.extend([True]*len(as_lines))
269
def _flush_copy(self, old_start_linenum, num_lines,
270
out_lines, index_lines):
271
if old_start_linenum == 0:
274
first_byte = self.line_offsets[old_start_linenum - 1]
275
stop_byte = self.line_offsets[old_start_linenum + num_lines - 1]
276
num_bytes = stop_byte - first_byte
277
# The data stream allows >64kB in a copy, but to match the compiled
278
# code, we will also limit it to a 64kB copy
279
for start_byte in xrange(first_byte, stop_byte, 64*1024):
280
num_bytes = min(64*1024, stop_byte - start_byte)
281
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
282
out_lines.append(copy_bytes)
283
index_lines.append(False)
285
def make_delta(self, new_lines, bytes_length=None, soft=False):
286
"""Compute the delta for this content versus the original content."""
287
if bytes_length is None:
288
bytes_length = sum(map(len, new_lines))
289
# reserved for content type, content length
290
out_lines = ['', '', encode_base128_int(bytes_length)]
291
index_lines = [False, False, False]
292
output_handler = _OutputHandler(out_lines, index_lines,
293
self._MIN_MATCH_BYTES)
294
blocks = self.get_matching_blocks(new_lines, soft=soft)
296
# We either copy a range (while there are reusable lines) or we
297
# insert new lines. To find reusable lines we traverse
298
for old_start, new_start, range_len in blocks:
299
if new_start != current_line_num:
300
# non-matching region, insert the content
301
output_handler.add_insert(new_lines[current_line_num:new_start])
302
current_line_num = new_start + range_len
304
# Convert the line based offsets into byte based offsets
308
first_byte = self.line_offsets[old_start - 1]
309
last_byte = self.line_offsets[old_start + range_len - 1]
310
output_handler.add_copy(first_byte, last_byte)
311
return out_lines, index_lines
314
def encode_base128_int(val):
315
"""Convert an integer into a 7-bit lsb encoding."""
319
bytes.append(chr((val | 0x80) & 0xFF))
321
bytes.append(chr(val))
322
return ''.join(bytes)
325
def decode_base128_int(bytes):
326
"""Decode an integer from a 7-bit lsb encoding."""
330
bval = ord(bytes[offset])
332
val |= (bval & 0x7F) << shift
335
bval = ord(bytes[offset])
341
def encode_copy_instruction(offset, length):
342
"""Convert this offset into a control code and bytes."""
346
for copy_bit in (0x01, 0x02, 0x04, 0x08):
347
base_byte = offset & 0xff
349
copy_command |= copy_bit
350
copy_bytes.append(chr(base_byte))
353
raise ValueError("cannot supply a length of None")
355
raise ValueError("we don't emit copy records for lengths > 64KiB")
357
raise ValueError("We cannot emit a copy of length 0")
358
if length != 0x10000:
359
# A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
360
# since that saves bytes for large chained copies
361
for copy_bit in (0x10, 0x20):
362
base_byte = length & 0xff
364
copy_command |= copy_bit
365
copy_bytes.append(chr(base_byte))
367
copy_bytes[0] = chr(copy_command)
368
return ''.join(copy_bytes)
371
def decode_copy_instruction(bytes, cmd, pos):
372
"""Decode a copy instruction from the next few bytes.
374
A copy instruction is a variable number of bytes, so we will parse the
375
bytes we care about, and return the new position, as well as the offset and
376
length referred to in the bytes.
378
:param bytes: A string of bytes
379
:param cmd: The command code
380
:param pos: The position in bytes right after the copy command
381
:return: (offset, length, newpos)
382
The offset of the copy start, the number of bytes to copy, and the
383
position after the last byte of the copy
385
if cmd & 0x80 != 0x80:
386
raise ValueError('copy instructions must have bit 0x80 set')
390
offset = ord(bytes[pos])
393
offset = offset | (ord(bytes[pos]) << 8)
396
offset = offset | (ord(bytes[pos]) << 16)
399
offset = offset | (ord(bytes[pos]) << 24)
402
length = ord(bytes[pos])
405
length = length | (ord(bytes[pos]) << 8)
408
length = length | (ord(bytes[pos]) << 16)
412
return (offset, length, pos)
415
def make_delta(source_bytes, target_bytes):
416
"""Create a delta from source to target."""
417
if type(source_bytes) is not str:
418
raise TypeError('source is not a str')
419
if type(target_bytes) is not str:
420
raise TypeError('target is not a str')
421
line_locations = LinesDeltaIndex(osutils.split_lines(source_bytes))
422
delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes),
423
bytes_length=len(target_bytes))
424
return ''.join(delta)
427
def apply_delta(basis, delta):
428
"""Apply delta to this object to become new_version_id."""
429
if type(basis) is not str:
430
raise TypeError('basis is not a str')
431
if type(delta) is not str:
432
raise TypeError('delta is not a str')
433
target_length, pos = decode_base128_int(delta)
435
len_delta = len(delta)
436
while pos < len_delta:
437
cmd = ord(delta[pos])
440
offset, length, pos = decode_copy_instruction(delta, cmd, pos)
441
last = offset + length
442
if last > len(basis):
443
raise ValueError('data would copy bytes past the'
445
lines.append(basis[offset:last])
446
else: # Insert of 'cmd' bytes
448
raise ValueError('Command == 0 not supported yet')
449
lines.append(delta[pos:pos+cmd])
451
bytes = ''.join(lines)
452
if len(bytes) != target_length:
453
raise ValueError('Delta claimed to be %d long, but ended up'
454
' %d long' % (target_length, len(bytes)))
458
def apply_delta_to_source(source, delta_start, delta_end):
459
"""Extract a delta from source bytes, and apply it."""
460
source_size = len(source)
461
if delta_start >= source_size:
462
raise ValueError('delta starts after source')
463
if delta_end > source_size:
464
raise ValueError('delta ends after source')
465
if delta_start >= delta_end:
466
raise ValueError('delta starts after it ends')
467
delta_bytes = source[delta_start:delta_end]
468
return apply_delta(source, delta_bytes)