40
40
print next(), next()
41
41
for header in lines:
42
start, end, count = [int(n) for n in header.split(',')]
43
contents = [next() for i in xrange(count)]
44
result.append((start, end, count, contents))
44
numbers = [int(n) for n in header[2:].split(',')]
46
result.append((op, numbers[0], numbers[1], None))
48
contents = [next() for i in xrange(numbers[0])]
49
result.append((op, None, numbers[0], contents))
47
52
def apply_delta(basis, delta):
51
56
# eq ranges occur where gaps occur
52
57
# start, end refer to offsets in basis
53
for start, end, count, delta_lines in delta:
54
if last_offset != start: # copy an eq range
55
lines.extend(basis[last_offset:start])
56
lines[start:end] = delta_lines
58
if last_offset != len(basis):
59
lines.extend(basis[last_offset:])
58
for op, start, count, delta_lines in delta:
60
lines.extend(basis[start:start+count])
62
lines.extend(delta_lines)
60
63
trim_encoding_newline(lines)
71
74
class GroupCompressor(object):
72
"""Produce a serialised group of compressed texts."""
75
"""Produce a serialised group of compressed texts.
77
It contains code very similar to SequenceMatcher because of having a similar
78
task. However some key differences apply:
79
- there is no junk, we want a minimal edit not a human readable diff.
80
- we don't filter very common lines (because we don't know where a good
81
range will start, and after the first text we want to be emitting minmal
83
- we chain the left side, not the right side
84
- we incrementally update the adjacency matrix as new lines are provided.
85
- we look for matches in all of the left side, so the routine which does
86
the analagous task of find_longest_match does not need to filter on the
74
90
def __init__(self, delta=True):
75
91
"""Create a GroupCompressor.
105
123
new_lines.append('label: %s\n' % label)
106
124
new_lines.append('sha1: %s\n' % sha1)
108
delta_seq = diff.difflib.SequenceMatcher(
109
None, self.lines, lines)
111
delta_seq = patiencediff.PatienceSequenceMatcher(
112
None, self.lines, lines)
114
for op in delta_seq.get_opcodes():
117
diff_hunks.append((op[1], op[2], op[4]-op[3], lines[op[3]:op[4]]))
118
for start, end, count, new in diff_hunks:
119
new_lines.append('%d,%d,%d\n' % (start, end, count))
120
new_lines.extend(new)
121
self.endpoint += sum(map(len, new_lines))
122
self.lines.extend(new_lines)
126
line_locations = self.line_locations
131
# We either copy a range (while there are reusable lines) or we
132
# insert new lines. To find reusable lines we traverse
133
while pos < len(lines):
135
if line not in line_locations:
138
copy_start = min(copy_ends) - copy_len
139
new_lines.append("c,%d,%d\n" % (copy_start,copy_len))
147
locations, next = line_locations[line]
148
next_locations = locations.intersection(copy_ends)
149
if len(next_locations):
152
copy_ends = set(loc + 1 for loc in next_locations)
154
# range stops, flush and start a new copy range
155
copy_start = min(copy_ends) - copy_len
156
new_lines.append("c,%d,%d\n" % (copy_start,copy_len))
162
new_lines.append("i,%d\n" % new_len)
163
new_lines.extend(lines[new_start:new_start+new_len])
166
copy_ends = line_locations[line][1]
170
copy_start = min(copy_ends) - copy_len
171
new_lines.append("c,%d,%d\n" % (copy_start,copy_len))
173
new_lines.append("i,%d\n" % new_len)
174
new_lines.extend(lines[new_start:new_start+new_len])
176
self.output_lines(new_lines)
123
177
trim_encoding_newline(lines)
124
178
self.input_bytes += sum(map(len, lines))
125
179
return sha1, self.endpoint
181
def output_lines(self, new_lines):
182
self.endpoint += sum(map(len, new_lines))
183
offset = len(self.lines)
184
self.lines.extend(new_lines)
185
for pos, line in enumerate(new_lines):
186
indices, next_lines = self.line_locations.setdefault(line,
188
indices.add(pos + offset)
189
next_lines.add(pos + offset + 1)
128
192
"""Return the overall compression ratio."""
129
193
return float(self.input_bytes) / float(self.endpoint)
224
288
# double handling for now. Make it work until then.
225
289
bytes = ''.join(lines)
226
290
record = FulltextContentFactory(key, parents, None, bytes)
227
sha1 = self._insert_record_stream([record])
291
sha1 = self._insert_record_stream([record]).next()
228
292
return sha1, len(bytes), None
230
294
def _check_add(self, key, lines, random_id, check_content):