23
23
from bzrlib import osutils
26
class _OutputHandler(object):
27
"""A simple class which just tracks how to split up an insert request."""
29
def __init__(self, out_lines, index_lines, min_len_to_index):
30
self.out_lines = out_lines
31
self.index_lines = index_lines
32
self.min_len_to_index = min_len_to_index
33
self.cur_insert_lines = []
34
self.cur_insert_len = 0
36
def add_copy(self, start_byte, end_byte):
37
# The data stream allows >64kB in a copy, but to match the compiled
38
# code, we will also limit it to a 64kB copy
39
for start_byte in xrange(start_byte, end_byte, 64*1024):
40
num_bytes = min(64*1024, end_byte - start_byte)
41
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
42
self.out_lines.append(copy_bytes)
43
self.index_lines.append(False)
45
def _flush_insert(self):
46
if not self.cur_insert_lines:
48
if self.cur_insert_len > 127:
49
raise AssertionError('We cannot insert more than 127 bytes'
51
self.out_lines.append(chr(self.cur_insert_len))
52
self.index_lines.append(False)
53
self.out_lines.extend(self.cur_insert_lines)
54
if self.cur_insert_len < self.min_len_to_index:
55
self.index_lines.extend([False]*len(self.cur_insert_lines))
57
self.index_lines.extend([True]*len(self.cur_insert_lines))
58
self.cur_insert_lines = []
59
self.cur_insert_len = 0
61
def _insert_long_line(self, line):
62
# Flush out anything pending
65
for start_index in xrange(0, line_len, 127):
66
next_len = min(127, line_len - start_index)
67
self.out_lines.append(chr(next_len))
68
self.index_lines.append(False)
69
self.out_lines.append(line[start_index:start_index+next_len])
70
# We don't index long lines, because we won't be able to match
71
# a line split across multiple inserts anway
72
self.index_lines.append(False)
74
def add_insert(self, lines):
75
if self.cur_insert_lines != []:
76
raise AssertionError('self.cur_insert_lines must be empty when'
77
' adding a new insert')
80
self._insert_long_line(line)
82
next_len = len(line) + self.cur_insert_len
84
# Adding this line would overflow, so flush, and start over
86
self.cur_insert_lines = [line]
87
self.cur_insert_len = len(line)
89
self.cur_insert_lines.append(line)
90
self.cur_insert_len = next_len
26
94
class LinesDeltaIndex(object):
27
95
"""This class indexes matches between strings.
152
prev_locations = None
78
153
max_pos = len(lines)
154
matching = self._matching_lines
79
155
while pos < max_pos:
81
# TODO: is try/except better than get(..., None)?
83
locations = self._matching_lines[lines[pos]]
157
locations = matching[lines[pos]]
87
159
# No more matches, just return whatever we have, but we know
88
160
# that this last position is not going to match anything
164
if prev_locations is None:
165
# This is the first match in a range
166
prev_locations = locations
168
locations = None # Consumed
94
# This is the first match in a range
95
copy_ends = [loc + 1 for loc in locations]
170
# We have a match started, compare to see if any of the
171
# current matches can be continued
172
next_locations = locations.intersection([loc + 1 for loc
175
# At least one of the regions continues to match
176
prev_locations = set(next_locations)
97
178
locations = None # Consumed
99
# We have a match started, compare to see if any of the
100
# current matches can be continued
101
next_locations = set(copy_ends).intersection(locations)
103
# At least one of the regions continues to match
104
copy_ends = [loc + 1 for loc in next_locations]
106
locations = None # Consumed
108
# All current regions no longer match.
109
# This line does still match something, just not at the
110
# end of the previous matches. We will return locations
111
# so that we can avoid another _matching_lines lookup.
180
# All current regions no longer match.
181
# This line does still match something, just not at the
182
# end of the previous matches. We will return locations
183
# so that we can avoid another _matching_lines lookup.
114
if copy_ends is None:
186
if prev_locations is None:
115
187
# We have no matches, this is a pure insert
116
return None, pos, locations
117
return (((min(copy_ends) - range_len, range_start, range_len)),
189
smallest = min(prev_locations)
190
return (smallest - range_len + 1, range_start, range_len), pos
120
192
def get_matching_blocks(self, lines, soft=False):
121
193
"""Return the ranges in lines which match self.lines.
137
208
max_pos = len(lines)
138
209
result_append = result.append
210
min_match_bytes = self._MIN_MATCH_BYTES
141
min_match_bytes = 200
212
min_match_bytes = self._SOFT_MIN_MATCH_BYTES
142
213
while pos < max_pos:
143
block, pos, locations = self._get_longest_match(lines, pos,
214
block, pos = self._get_longest_match(lines, pos)
145
215
if block is not None:
146
216
# Check to see if we match fewer than min_match_bytes. As we
147
217
# will turn this into a pure 'insert', rather than a copy.
217
287
# reserved for content type, content length
218
288
out_lines = ['', '', encode_base128_int(bytes_length)]
219
289
index_lines = [False, False, False]
290
output_handler = _OutputHandler(out_lines, index_lines,
291
self._MIN_MATCH_BYTES)
220
292
blocks = self.get_matching_blocks(new_lines, soft=soft)
221
293
current_line_num = 0
222
294
# We either copy a range (while there are reusable lines) or we
224
296
for old_start, new_start, range_len in blocks:
225
297
if new_start != current_line_num:
226
298
# non-matching region, insert the content
227
self._flush_insert(current_line_num, new_start,
228
new_lines, out_lines, index_lines)
299
output_handler.add_insert(new_lines[current_line_num:new_start])
229
300
current_line_num = new_start + range_len
231
self._flush_copy(old_start, range_len, out_lines, index_lines)
302
# Convert the line based offsets into byte based offsets
306
first_byte = self.line_offsets[old_start - 1]
307
last_byte = self.line_offsets[old_start + range_len - 1]
308
output_handler.add_copy(first_byte, last_byte)
232
309
return out_lines, index_lines