23
23
from bzrlib import osutils
26
class _OutputHandler(object):
27
"""A simple class which just tracks how to split up an insert request."""
29
def __init__(self, out_lines, index_lines, min_len_to_index):
30
self.out_lines = out_lines
31
self.index_lines = index_lines
32
self.min_len_to_index = min_len_to_index
33
self.cur_insert_lines = []
34
self.cur_insert_len = 0
36
def add_copy(self, start_byte, end_byte):
37
# The data stream allows >64kB in a copy, but to match the compiled
38
# code, we will also limit it to a 64kB copy
39
for start_byte in xrange(start_byte, end_byte, 64*1024):
40
num_bytes = min(64*1024, end_byte - start_byte)
41
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
42
self.out_lines.append(copy_bytes)
43
self.index_lines.append(False)
45
def _flush_insert(self):
46
if not self.cur_insert_lines:
48
if self.cur_insert_len > 127:
49
raise AssertionError('We cannot insert more than 127 bytes'
51
self.out_lines.append(chr(self.cur_insert_len))
52
self.index_lines.append(False)
53
self.out_lines.extend(self.cur_insert_lines)
54
if self.cur_insert_len < self.min_len_to_index:
55
self.index_lines.extend([False]*len(self.cur_insert_lines))
57
self.index_lines.extend([True]*len(self.cur_insert_lines))
58
self.cur_insert_lines = []
59
self.cur_insert_len = 0
61
def _insert_long_line(self, line):
62
# Flush out anything pending
65
for start_index in xrange(0, line_len, 127):
66
next_len = min(127, line_len - start_index)
67
self.out_lines.append(chr(next_len))
68
self.index_lines.append(False)
69
self.out_lines.append(line[start_index:start_index+next_len])
70
# We don't index long lines, because we won't be able to match
71
# a line split across multiple inserts anway
72
self.index_lines.append(False)
74
def add_insert(self, lines):
75
if self.cur_insert_lines != []:
76
raise AssertionError('self.cur_insert_lines must be empty when'
77
' adding a new insert')
80
self._insert_long_line(line)
82
next_len = len(line) + self.cur_insert_len
84
# Adding this line would overflow, so flush, and start over
86
self.cur_insert_lines = [line]
87
self.cur_insert_len = len(line)
89
self.cur_insert_lines.append(line)
90
self.cur_insert_len = next_len
94
26
class LinesDeltaIndex(object):
95
27
"""This class indexes matches between strings.
152
prev_locations = None
153
78
max_pos = len(lines)
154
matching = self._matching_lines
155
79
while pos < max_pos:
157
locations = matching[lines[pos]]
81
# TODO: is try/except better than get(..., None)?
83
locations = self._matching_lines[lines[pos]]
159
87
# No more matches, just return whatever we have, but we know
160
88
# that this last position is not going to match anything
164
if prev_locations is None:
165
# This is the first match in a range
166
prev_locations = locations
168
locations = None # Consumed
170
# We have a match started, compare to see if any of the
171
# current matches can be continued
172
next_locations = locations.intersection([loc + 1 for loc
175
# At least one of the regions continues to match
176
prev_locations = set(next_locations)
94
# This is the first match in a range
95
copy_ends = [loc + 1 for loc in locations]
178
97
locations = None # Consumed
180
# All current regions no longer match.
181
# This line does still match something, just not at the
182
# end of the previous matches. We will return locations
183
# so that we can avoid another _matching_lines lookup.
99
# We have a match started, compare to see if any of the
100
# current matches can be continued
101
next_locations = set(copy_ends).intersection(locations)
103
# At least one of the regions continues to match
104
copy_ends = [loc + 1 for loc in next_locations]
106
locations = None # Consumed
108
# All current regions no longer match.
109
# This line does still match something, just not at the
110
# end of the previous matches. We will return locations
111
# so that we can avoid another _matching_lines lookup.
186
if prev_locations is None:
114
if copy_ends is None:
187
115
# We have no matches, this is a pure insert
189
smallest = min(prev_locations)
190
return (smallest - range_len + 1, range_start, range_len), pos
116
return None, pos, locations
117
return (((min(copy_ends) - range_len, range_start, range_len)),
192
120
def get_matching_blocks(self, lines, soft=False):
193
121
"""Return the ranges in lines which match self.lines.
208
137
max_pos = len(lines)
209
138
result_append = result.append
210
min_match_bytes = self._MIN_MATCH_BYTES
212
min_match_bytes = self._SOFT_MIN_MATCH_BYTES
141
min_match_bytes = 200
213
142
while pos < max_pos:
214
block, pos = self._get_longest_match(lines, pos)
143
block, pos, locations = self._get_longest_match(lines, pos,
215
145
if block is not None:
216
146
# Check to see if we match fewer than min_match_bytes. As we
217
147
# will turn this into a pure 'insert', rather than a copy.
275
205
# The data stream allows >64kB in a copy, but to match the compiled
276
206
# code, we will also limit it to a 64kB copy
277
207
for start_byte in xrange(first_byte, stop_byte, 64*1024):
278
num_bytes = min(64*1024, stop_byte - start_byte)
208
num_bytes = min(64*1024, stop_byte - first_byte)
279
209
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
280
210
out_lines.append(copy_bytes)
281
211
index_lines.append(False)
287
217
# reserved for content type, content length
288
218
out_lines = ['', '', encode_base128_int(bytes_length)]
289
219
index_lines = [False, False, False]
290
output_handler = _OutputHandler(out_lines, index_lines,
291
self._MIN_MATCH_BYTES)
292
220
blocks = self.get_matching_blocks(new_lines, soft=soft)
293
221
current_line_num = 0
294
222
# We either copy a range (while there are reusable lines) or we
296
224
for old_start, new_start, range_len in blocks:
297
225
if new_start != current_line_num:
298
226
# non-matching region, insert the content
299
output_handler.add_insert(new_lines[current_line_num:new_start])
227
self._flush_insert(current_line_num, new_start,
228
new_lines, out_lines, index_lines)
300
229
current_line_num = new_start + range_len
302
# Convert the line based offsets into byte based offsets
306
first_byte = self.line_offsets[old_start - 1]
307
last_byte = self.line_offsets[old_start + range_len - 1]
308
output_handler.add_copy(first_byte, last_byte)
231
self._flush_copy(old_start, range_len, out_lines, index_lines)
309
232
return out_lines, index_lines