23
from __future__ import absolute_import
23
25
from bzrlib import osutils
28
class _OutputHandler(object):
29
"""A simple class which just tracks how to split up an insert request."""
31
def __init__(self, out_lines, index_lines, min_len_to_index):
32
self.out_lines = out_lines
33
self.index_lines = index_lines
34
self.min_len_to_index = min_len_to_index
35
self.cur_insert_lines = []
36
self.cur_insert_len = 0
38
def add_copy(self, start_byte, end_byte):
39
# The data stream allows >64kB in a copy, but to match the compiled
40
# code, we will also limit it to a 64kB copy
41
for start_byte in xrange(start_byte, end_byte, 64*1024):
42
num_bytes = min(64*1024, end_byte - start_byte)
43
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
44
self.out_lines.append(copy_bytes)
45
self.index_lines.append(False)
47
def _flush_insert(self):
48
if not self.cur_insert_lines:
50
if self.cur_insert_len > 127:
51
raise AssertionError('We cannot insert more than 127 bytes'
53
self.out_lines.append(chr(self.cur_insert_len))
54
self.index_lines.append(False)
55
self.out_lines.extend(self.cur_insert_lines)
56
if self.cur_insert_len < self.min_len_to_index:
57
self.index_lines.extend([False]*len(self.cur_insert_lines))
59
self.index_lines.extend([True]*len(self.cur_insert_lines))
60
self.cur_insert_lines = []
61
self.cur_insert_len = 0
63
def _insert_long_line(self, line):
64
# Flush out anything pending
67
for start_index in xrange(0, line_len, 127):
68
next_len = min(127, line_len - start_index)
69
self.out_lines.append(chr(next_len))
70
self.index_lines.append(False)
71
self.out_lines.append(line[start_index:start_index+next_len])
72
# We don't index long lines, because we won't be able to match
73
# a line split across multiple inserts anway
74
self.index_lines.append(False)
76
def add_insert(self, lines):
77
if self.cur_insert_lines != []:
78
raise AssertionError('self.cur_insert_lines must be empty when'
79
' adding a new insert')
82
self._insert_long_line(line)
84
next_len = len(line) + self.cur_insert_len
86
# Adding this line would overflow, so flush, and start over
88
self.cur_insert_lines = [line]
89
self.cur_insert_len = len(line)
91
self.cur_insert_lines.append(line)
92
self.cur_insert_len = next_len
26
96
class LinesDeltaIndex(object):
27
97
"""This class indexes matches between strings.
154
prev_locations = None
78
155
max_pos = len(lines)
156
matching = self._matching_lines
79
157
while pos < max_pos:
81
# TODO: is try/except better than get(..., None)?
83
locations = self._matching_lines[lines[pos]]
159
locations = matching[lines[pos]]
87
161
# No more matches, just return whatever we have, but we know
88
162
# that this last position is not going to match anything
166
if prev_locations is None:
167
# This is the first match in a range
168
prev_locations = locations
170
locations = None # Consumed
94
# This is the first match in a range
95
copy_ends = [loc + 1 for loc in locations]
172
# We have a match started, compare to see if any of the
173
# current matches can be continued
174
next_locations = locations.intersection([loc + 1 for loc
177
# At least one of the regions continues to match
178
prev_locations = set(next_locations)
97
180
locations = None # Consumed
99
# We have a match started, compare to see if any of the
100
# current matches can be continued
101
next_locations = set(copy_ends).intersection(locations)
103
# At least one of the regions continues to match
104
copy_ends = [loc + 1 for loc in next_locations]
106
locations = None # Consumed
108
# All current regions no longer match.
109
# This line does still match something, just not at the
110
# end of the previous matches. We will return locations
111
# so that we can avoid another _matching_lines lookup.
182
# All current regions no longer match.
183
# This line does still match something, just not at the
184
# end of the previous matches. We will return locations
185
# so that we can avoid another _matching_lines lookup.
114
if copy_ends is None:
188
if prev_locations is None:
115
189
# We have no matches, this is a pure insert
116
return None, pos, locations
117
return (((min(copy_ends) - range_len, range_start, range_len)),
191
smallest = min(prev_locations)
192
return (smallest - range_len + 1, range_start, range_len), pos
120
194
def get_matching_blocks(self, lines, soft=False):
121
195
"""Return the ranges in lines which match self.lines.
137
210
max_pos = len(lines)
138
211
result_append = result.append
212
min_match_bytes = self._MIN_MATCH_BYTES
141
min_match_bytes = 200
214
min_match_bytes = self._SOFT_MIN_MATCH_BYTES
142
215
while pos < max_pos:
143
block, pos, locations = self._get_longest_match(lines, pos,
216
block, pos = self._get_longest_match(lines, pos)
145
217
if block is not None:
146
218
# Check to see if we match fewer than min_match_bytes. As we
147
219
# will turn this into a pure 'insert', rather than a copy.
205
277
# The data stream allows >64kB in a copy, but to match the compiled
206
278
# code, we will also limit it to a 64kB copy
207
279
for start_byte in xrange(first_byte, stop_byte, 64*1024):
208
num_bytes = min(64*1024, stop_byte - first_byte)
280
num_bytes = min(64*1024, stop_byte - start_byte)
209
281
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
210
282
out_lines.append(copy_bytes)
211
283
index_lines.append(False)
217
289
# reserved for content type, content length
218
290
out_lines = ['', '', encode_base128_int(bytes_length)]
219
291
index_lines = [False, False, False]
292
output_handler = _OutputHandler(out_lines, index_lines,
293
self._MIN_MATCH_BYTES)
220
294
blocks = self.get_matching_blocks(new_lines, soft=soft)
221
295
current_line_num = 0
222
296
# We either copy a range (while there are reusable lines) or we
224
298
for old_start, new_start, range_len in blocks:
225
299
if new_start != current_line_num:
226
300
# non-matching region, insert the content
227
self._flush_insert(current_line_num, new_start,
228
new_lines, out_lines, index_lines)
301
output_handler.add_insert(new_lines[current_line_num:new_start])
229
302
current_line_num = new_start + range_len
231
self._flush_copy(old_start, range_len, out_lines, index_lines)
304
# Convert the line based offsets into byte based offsets
308
first_byte = self.line_offsets[old_start - 1]
309
last_byte = self.line_offsets[old_start + range_len - 1]
310
output_handler.add_copy(first_byte, last_byte)
232
311
return out_lines, index_lines