4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
1 |
# Copyright (C) 2009 Canonical Ltd
|
2 |
#
|
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
7 |
#
|
|
8 |
# This program is distributed in the hope that it will be useful,
|
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
12 |
#
|
|
13 |
# You should have received a copy of the GNU General Public License
|
|
14 |
# along with this program; if not, write to the Free Software
|
|
15 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
16 |
||
17 |
"""Python version of compiled extensions for doing compression.
|
|
18 |
||
19 |
We separate the implementation from the groupcompress.py to avoid importing
|
|
20 |
useless stuff.
|
|
21 |
"""
|
|
22 |
||
6379.6.7
by Jelmer Vernooij
Move importing from future until after doc string, otherwise the doc string will disappear. |
23 |
from __future__ import absolute_import |
24 |
||
3735.40.7
by John Arbash Meinel
Move even more functionality into EquivalenceTable. |
25 |
from bzrlib import osutils |
26 |
||
27 |
||
4300.1.2
by John Arbash Meinel
Change the pure-python compressor a bit. |
28 |
class _OutputHandler(object): |
29 |
"""A simple class which just tracks how to split up an insert request."""
|
|
30 |
||
31 |
def __init__(self, out_lines, index_lines, min_len_to_index): |
|
32 |
self.out_lines = out_lines |
|
33 |
self.index_lines = index_lines |
|
34 |
self.min_len_to_index = min_len_to_index |
|
35 |
self.cur_insert_lines = [] |
|
36 |
self.cur_insert_len = 0 |
|
37 |
||
38 |
def add_copy(self, start_byte, end_byte): |
|
39 |
# The data stream allows >64kB in a copy, but to match the compiled
|
|
40 |
# code, we will also limit it to a 64kB copy
|
|
41 |
for start_byte in xrange(start_byte, end_byte, 64*1024): |
|
42 |
num_bytes = min(64*1024, end_byte - start_byte) |
|
43 |
copy_bytes = encode_copy_instruction(start_byte, num_bytes) |
|
44 |
self.out_lines.append(copy_bytes) |
|
45 |
self.index_lines.append(False) |
|
46 |
||
47 |
def _flush_insert(self): |
|
48 |
if not self.cur_insert_lines: |
|
49 |
return
|
|
4300.1.7
by John Arbash Meinel
Bring in the other test cases. |
50 |
if self.cur_insert_len > 127: |
51 |
raise AssertionError('We cannot insert more than 127 bytes' |
|
52 |
' at a time.') |
|
4300.1.2
by John Arbash Meinel
Change the pure-python compressor a bit. |
53 |
self.out_lines.append(chr(self.cur_insert_len)) |
54 |
self.index_lines.append(False) |
|
55 |
self.out_lines.extend(self.cur_insert_lines) |
|
56 |
if self.cur_insert_len < self.min_len_to_index: |
|
57 |
self.index_lines.extend([False]*len(self.cur_insert_lines)) |
|
58 |
else: |
|
59 |
self.index_lines.extend([True]*len(self.cur_insert_lines)) |
|
60 |
self.cur_insert_lines = [] |
|
61 |
self.cur_insert_len = 0 |
|
62 |
||
63 |
def _insert_long_line(self, line): |
|
64 |
# Flush out anything pending
|
|
65 |
self._flush_insert() |
|
66 |
line_len = len(line) |
|
67 |
for start_index in xrange(0, line_len, 127): |
|
68 |
next_len = min(127, line_len - start_index) |
|
69 |
self.out_lines.append(chr(next_len)) |
|
70 |
self.index_lines.append(False) |
|
71 |
self.out_lines.append(line[start_index:start_index+next_len]) |
|
72 |
# We don't index long lines, because we won't be able to match
|
|
73 |
# a line split across multiple inserts anway
|
|
74 |
self.index_lines.append(False) |
|
75 |
||
76 |
def add_insert(self, lines): |
|
4300.1.7
by John Arbash Meinel
Bring in the other test cases. |
77 |
if self.cur_insert_lines != []: |
78 |
raise AssertionError('self.cur_insert_lines must be empty when' |
|
79 |
' adding a new insert') |
|
4300.1.2
by John Arbash Meinel
Change the pure-python compressor a bit. |
80 |
for line in lines: |
81 |
if len(line) > 127: |
|
82 |
self._insert_long_line(line) |
|
83 |
else: |
|
84 |
next_len = len(line) + self.cur_insert_len |
|
85 |
if next_len > 127: |
|
86 |
# Adding this line would overflow, so flush, and start over
|
|
87 |
self._flush_insert() |
|
88 |
self.cur_insert_lines = [line] |
|
89 |
self.cur_insert_len = len(line) |
|
90 |
else: |
|
91 |
self.cur_insert_lines.append(line) |
|
92 |
self.cur_insert_len = next_len |
|
93 |
self._flush_insert() |
|
94 |
||
95 |
||
3735.40.13
by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex. |
96 |
class LinesDeltaIndex(object): |
97 |
"""This class indexes matches between strings.
|
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
98 |
|
99 |
:ivar lines: The 'static' lines that will be preserved between runs.
|
|
3735.40.13
by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex. |
100 |
:ivar _matching_lines: A dict of {line:[matching offsets]}
|
101 |
:ivar line_offsets: The byte offset for the end of each line, used to
|
|
102 |
quickly map between a matching line number and the byte location
|
|
103 |
:ivar endpoint: The total number of bytes in self.line_offsets
|
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
104 |
"""
|
105 |
||
4300.1.2
by John Arbash Meinel
Change the pure-python compressor a bit. |
106 |
_MIN_MATCH_BYTES = 10 |
107 |
_SOFT_MIN_MATCH_BYTES = 200 |
|
108 |
||
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
109 |
def __init__(self, lines): |
3735.40.13
by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex. |
110 |
self.lines = [] |
3735.40.6
by John Arbash Meinel
Move more information down into EquivalenceTable. |
111 |
self.line_offsets = [] |
3735.40.13
by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex. |
112 |
self.endpoint = 0 |
113 |
self._matching_lines = {} |
|
114 |
self.extend_lines(lines, [True]*len(lines)) |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
115 |
|
116 |
def _update_matching_lines(self, new_lines, index): |
|
117 |
matches = self._matching_lines |
|
118 |
start_idx = len(self.lines) |
|
3735.40.15
by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code. |
119 |
if len(new_lines) != len(index): |
120 |
raise AssertionError('The number of lines to be indexed does' |
|
121 |
' not match the index/don\'t index flags: %d != %d' |
|
122 |
% (len(new_lines), len(index))) |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
123 |
for idx, do_index in enumerate(index): |
124 |
if not do_index: |
|
125 |
continue
|
|
4300.1.4
by John Arbash Meinel
Change self._matching_lines to use a set rather than a list. |
126 |
line = new_lines[idx] |
127 |
try: |
|
128 |
matches[line].add(start_idx + idx) |
|
129 |
except KeyError: |
|
130 |
matches[line] = set([start_idx + idx]) |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
131 |
|
132 |
def get_matches(self, line): |
|
133 |
"""Return the lines which match the line in right."""
|
|
134 |
try: |
|
135 |
return self._matching_lines[line] |
|
136 |
except KeyError: |
|
137 |
return None |
|
138 |
||
4300.1.5
by John Arbash Meinel
A couple more cleanups on the pure-python implementation. |
139 |
def _get_longest_match(self, lines, pos): |
3735.40.15
by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code. |
140 |
"""Look at all matches for the current line, return the longest.
|
141 |
||
142 |
:param lines: The lines we are matching against
|
|
143 |
:param pos: The current location we care about
|
|
144 |
:param locations: A list of lines that matched the current location.
|
|
145 |
This may be None, but often we'll have already found matches for
|
|
146 |
this line.
|
|
147 |
:return: (start_in_self, start_in_lines, num_lines)
|
|
148 |
All values are the offset in the list (aka the line number)
|
|
149 |
If start_in_self is None, then we have no matches, and this line
|
|
150 |
should be inserted in the target.
|
|
151 |
"""
|
|
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
152 |
range_start = pos |
153 |
range_len = 0 |
|
4300.1.5
by John Arbash Meinel
A couple more cleanups on the pure-python implementation. |
154 |
prev_locations = None |
3735.40.15
by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code. |
155 |
max_pos = len(lines) |
4300.1.4
by John Arbash Meinel
Change self._matching_lines to use a set rather than a list. |
156 |
matching = self._matching_lines |
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
157 |
while pos < max_pos: |
4300.1.5
by John Arbash Meinel
A couple more cleanups on the pure-python implementation. |
158 |
try: |
159 |
locations = matching[lines[pos]] |
|
160 |
except KeyError: |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
161 |
# No more matches, just return whatever we have, but we know
|
162 |
# that this last position is not going to match anything
|
|
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
163 |
pos += 1 |
164 |
break
|
|
4300.1.5
by John Arbash Meinel
A couple more cleanups on the pure-python implementation. |
165 |
# We have a match
|
166 |
if prev_locations is None: |
|
167 |
# This is the first match in a range
|
|
168 |
prev_locations = locations |
|
169 |
range_len = 1 |
|
170 |
locations = None # Consumed |
|
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
171 |
else: |
4300.1.5
by John Arbash Meinel
A couple more cleanups on the pure-python implementation. |
172 |
# We have a match started, compare to see if any of the
|
173 |
# current matches can be continued
|
|
174 |
next_locations = locations.intersection([loc + 1 for loc |
|
175 |
in prev_locations]) |
|
176 |
if next_locations: |
|
177 |
# At least one of the regions continues to match
|
|
178 |
prev_locations = set(next_locations) |
|
179 |
range_len += 1 |
|
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
180 |
locations = None # Consumed |
181 |
else: |
|
4300.1.5
by John Arbash Meinel
A couple more cleanups on the pure-python implementation. |
182 |
# All current regions no longer match.
|
183 |
# This line does still match something, just not at the
|
|
184 |
# end of the previous matches. We will return locations
|
|
185 |
# so that we can avoid another _matching_lines lookup.
|
|
186 |
break
|
|
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
187 |
pos += 1 |
4300.1.5
by John Arbash Meinel
A couple more cleanups on the pure-python implementation. |
188 |
if prev_locations is None: |
3735.40.15
by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code. |
189 |
# We have no matches, this is a pure insert
|
4300.1.5
by John Arbash Meinel
A couple more cleanups on the pure-python implementation. |
190 |
return None, pos |
191 |
smallest = min(prev_locations) |
|
192 |
return (smallest - range_len + 1, range_start, range_len), pos |
|
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
193 |
|
194 |
def get_matching_blocks(self, lines, soft=False): |
|
195 |
"""Return the ranges in lines which match self.lines.
|
|
196 |
||
197 |
:param lines: lines to compress
|
|
198 |
:return: A list of (old_start, new_start, length) tuples which reflect
|
|
199 |
a region in self.lines that is present in lines. The last element
|
|
200 |
of the list is always (old_len, new_len, 0) to provide a end point
|
|
201 |
for generating instructions from the matching blocks list.
|
|
202 |
"""
|
|
3735.40.15
by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code. |
203 |
# In this code, we iterate over multiple _get_longest_match calls, to
|
204 |
# find the next longest copy, and possible insert regions. We then
|
|
205 |
# convert that to the simple matching_blocks representation, since
|
|
206 |
# otherwise inserting 10 lines in a row would show up as 10
|
|
207 |
# instructions.
|
|
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
208 |
result = [] |
209 |
pos = 0 |
|
210 |
max_pos = len(lines) |
|
211 |
result_append = result.append |
|
4300.1.2
by John Arbash Meinel
Change the pure-python compressor a bit. |
212 |
min_match_bytes = self._MIN_MATCH_BYTES |
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
213 |
if soft: |
4300.1.2
by John Arbash Meinel
Change the pure-python compressor a bit. |
214 |
min_match_bytes = self._SOFT_MIN_MATCH_BYTES |
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
215 |
while pos < max_pos: |
4300.1.5
by John Arbash Meinel
A couple more cleanups on the pure-python implementation. |
216 |
block, pos = self._get_longest_match(lines, pos) |
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
217 |
if block is not None: |
3735.40.14
by John Arbash Meinel
Get rid of the self._right_lines state. It doesn't matter anymore. |
218 |
# Check to see if we match fewer than min_match_bytes. As we
|
219 |
# will turn this into a pure 'insert', rather than a copy.
|
|
220 |
# block[-1] is the number of lines. A quick check says if we
|
|
221 |
# have more lines than min_match_bytes, then we know we have
|
|
222 |
# enough bytes.
|
|
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
223 |
if block[-1] < min_match_bytes: |
224 |
# This block may be a 'short' block, check
|
|
225 |
old_start, new_start, range_len = block |
|
226 |
matched_bytes = sum(map(len, |
|
227 |
lines[new_start:new_start + range_len])) |
|
228 |
if matched_bytes < min_match_bytes: |
|
229 |
block = None |
|
230 |
if block is not None: |
|
231 |
result_append(block) |
|
232 |
result_append((len(self.lines), len(lines), 0)) |
|
233 |
return result |
|
234 |
||
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
235 |
def extend_lines(self, lines, index): |
236 |
"""Add more lines to the left-lines list.
|
|
237 |
||
238 |
:param lines: A list of lines to add
|
|
239 |
:param index: A True/False for each node to define if it should be
|
|
240 |
indexed.
|
|
241 |
"""
|
|
242 |
self._update_matching_lines(lines, index) |
|
243 |
self.lines.extend(lines) |
|
3735.40.6
by John Arbash Meinel
Move more information down into EquivalenceTable. |
244 |
endpoint = self.endpoint |
245 |
for line in lines: |
|
246 |
endpoint += len(line) |
|
247 |
self.line_offsets.append(endpoint) |
|
3735.40.15
by John Arbash Meinel
Some cleanup passes over the LinesDeltaIndex code. |
248 |
if len(self.line_offsets) != len(self.lines): |
249 |
raise AssertionError('Somehow the line offset indicator' |
|
250 |
' got out of sync with the line counter.') |
|
3735.40.6
by John Arbash Meinel
Move more information down into EquivalenceTable. |
251 |
self.endpoint = endpoint |
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
252 |
|
253 |
def _flush_insert(self, start_linenum, end_linenum, |
|
254 |
new_lines, out_lines, index_lines): |
|
255 |
"""Add an 'insert' request to the data stream."""
|
|
256 |
bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum]) |
|
257 |
insert_length = len(bytes_to_insert) |
|
258 |
# Each insert instruction is at most 127 bytes long
|
|
259 |
for start_byte in xrange(0, insert_length, 127): |
|
260 |
insert_count = min(insert_length - start_byte, 127) |
|
261 |
out_lines.append(chr(insert_count)) |
|
262 |
# Don't index the 'insert' instruction
|
|
263 |
index_lines.append(False) |
|
264 |
insert = bytes_to_insert[start_byte:start_byte+insert_count] |
|
265 |
as_lines = osutils.split_lines(insert) |
|
266 |
out_lines.extend(as_lines) |
|
267 |
index_lines.extend([True]*len(as_lines)) |
|
268 |
||
269 |
def _flush_copy(self, old_start_linenum, num_lines, |
|
270 |
out_lines, index_lines): |
|
271 |
if old_start_linenum == 0: |
|
272 |
first_byte = 0 |
|
273 |
else: |
|
274 |
first_byte = self.line_offsets[old_start_linenum - 1] |
|
275 |
stop_byte = self.line_offsets[old_start_linenum + num_lines - 1] |
|
276 |
num_bytes = stop_byte - first_byte |
|
277 |
# The data stream allows >64kB in a copy, but to match the compiled
|
|
278 |
# code, we will also limit it to a 64kB copy
|
|
279 |
for start_byte in xrange(first_byte, stop_byte, 64*1024): |
|
4241.20.1
by John Arbash Meinel
Cherrypick the python groupcompress fix for bzr-1.14-final |
280 |
num_bytes = min(64*1024, stop_byte - start_byte) |
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
281 |
copy_bytes = encode_copy_instruction(start_byte, num_bytes) |
282 |
out_lines.append(copy_bytes) |
|
283 |
index_lines.append(False) |
|
284 |
||
3735.40.11
by John Arbash Meinel
Implement make_delta and apply_delta. |
285 |
def make_delta(self, new_lines, bytes_length=None, soft=False): |
3735.40.7
by John Arbash Meinel
Move even more functionality into EquivalenceTable. |
286 |
"""Compute the delta for this content versus the original content."""
|
3735.40.11
by John Arbash Meinel
Implement make_delta and apply_delta. |
287 |
if bytes_length is None: |
288 |
bytes_length = sum(map(len, new_lines)) |
|
289 |
# reserved for content type, content length
|
|
290 |
out_lines = ['', '', encode_base128_int(bytes_length)] |
|
3735.40.10
by John Arbash Meinel
Merge in the new delta format code. |
291 |
index_lines = [False, False, False] |
4300.1.2
by John Arbash Meinel
Change the pure-python compressor a bit. |
292 |
output_handler = _OutputHandler(out_lines, index_lines, |
293 |
self._MIN_MATCH_BYTES) |
|
3735.40.7
by John Arbash Meinel
Move even more functionality into EquivalenceTable. |
294 |
blocks = self.get_matching_blocks(new_lines, soft=soft) |
295 |
current_line_num = 0 |
|
296 |
# We either copy a range (while there are reusable lines) or we
|
|
297 |
# insert new lines. To find reusable lines we traverse
|
|
298 |
for old_start, new_start, range_len in blocks: |
|
299 |
if new_start != current_line_num: |
|
300 |
# non-matching region, insert the content
|
|
4300.1.2
by John Arbash Meinel
Change the pure-python compressor a bit. |
301 |
output_handler.add_insert(new_lines[current_line_num:new_start]) |
3735.40.7
by John Arbash Meinel
Move even more functionality into EquivalenceTable. |
302 |
current_line_num = new_start + range_len |
303 |
if range_len: |
|
4300.1.2
by John Arbash Meinel
Change the pure-python compressor a bit. |
304 |
# Convert the line based offsets into byte based offsets
|
305 |
if old_start == 0: |
|
306 |
first_byte = 0 |
|
307 |
else: |
|
308 |
first_byte = self.line_offsets[old_start - 1] |
|
309 |
last_byte = self.line_offsets[old_start + range_len - 1] |
|
310 |
output_handler.add_copy(first_byte, last_byte) |
|
3735.40.7
by John Arbash Meinel
Move even more functionality into EquivalenceTable. |
311 |
return out_lines, index_lines |
312 |
||
313 |
||
3735.40.11
by John Arbash Meinel
Implement make_delta and apply_delta. |
314 |
def encode_base128_int(val): |
315 |
"""Convert an integer into a 7-bit lsb encoding."""
|
|
316 |
bytes = [] |
|
317 |
count = 0 |
|
318 |
while val >= 0x80: |
|
319 |
bytes.append(chr((val | 0x80) & 0xFF)) |
|
320 |
val >>= 7 |
|
321 |
bytes.append(chr(val)) |
|
322 |
return ''.join(bytes) |
|
323 |
||
324 |
||
325 |
def decode_base128_int(bytes): |
|
326 |
"""Decode an integer from a 7-bit lsb encoding."""
|
|
327 |
offset = 0 |
|
328 |
val = 0 |
|
329 |
shift = 0 |
|
330 |
bval = ord(bytes[offset]) |
|
331 |
while bval >= 0x80: |
|
332 |
val |= (bval & 0x7F) << shift |
|
333 |
shift += 7 |
|
334 |
offset += 1 |
|
335 |
bval = ord(bytes[offset]) |
|
336 |
val |= bval << shift |
|
337 |
offset += 1 |
|
338 |
return val, offset |
|
339 |
||
340 |
||
3735.40.7
by John Arbash Meinel
Move even more functionality into EquivalenceTable. |
341 |
def encode_copy_instruction(offset, length): |
342 |
"""Convert this offset into a control code and bytes."""
|
|
343 |
copy_command = 0x80 |
|
344 |
copy_bytes = [None] |
|
345 |
||
346 |
for copy_bit in (0x01, 0x02, 0x04, 0x08): |
|
347 |
base_byte = offset & 0xff |
|
348 |
if base_byte: |
|
349 |
copy_command |= copy_bit |
|
350 |
copy_bytes.append(chr(base_byte)) |
|
351 |
offset >>= 8 |
|
352 |
if length is None: |
|
4300.2.1
by John Arbash Meinel
Fix bug #364900, properly remove the 64kB that was just encoded in the copy. |
353 |
raise ValueError("cannot supply a length of None") |
3735.40.7
by John Arbash Meinel
Move even more functionality into EquivalenceTable. |
354 |
if length > 0x10000: |
355 |
raise ValueError("we don't emit copy records for lengths > 64KiB") |
|
356 |
if length == 0: |
|
357 |
raise ValueError("We cannot emit a copy of length 0") |
|
358 |
if length != 0x10000: |
|
359 |
# A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
|
|
360 |
# since that saves bytes for large chained copies
|
|
361 |
for copy_bit in (0x10, 0x20): |
|
362 |
base_byte = length & 0xff |
|
363 |
if base_byte: |
|
364 |
copy_command |= copy_bit |
|
365 |
copy_bytes.append(chr(base_byte)) |
|
366 |
length >>= 8 |
|
367 |
copy_bytes[0] = chr(copy_command) |
|
368 |
return ''.join(copy_bytes) |
|
369 |
||
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
370 |
|
3735.40.11
by John Arbash Meinel
Implement make_delta and apply_delta. |
371 |
def decode_copy_instruction(bytes, cmd, pos): |
372 |
"""Decode a copy instruction from the next few bytes.
|
|
373 |
||
374 |
A copy instruction is a variable number of bytes, so we will parse the
|
|
375 |
bytes we care about, and return the new position, as well as the offset and
|
|
376 |
length referred to in the bytes.
|
|
377 |
||
378 |
:param bytes: A string of bytes
|
|
379 |
:param cmd: The command code
|
|
380 |
:param pos: The position in bytes right after the copy command
|
|
381 |
:return: (offset, length, newpos)
|
|
382 |
The offset of the copy start, the number of bytes to copy, and the
|
|
383 |
position after the last byte of the copy
|
|
384 |
"""
|
|
385 |
if cmd & 0x80 != 0x80: |
|
386 |
raise ValueError('copy instructions must have bit 0x80 set') |
|
387 |
offset = 0 |
|
388 |
length = 0 |
|
389 |
if (cmd & 0x01): |
|
390 |
offset = ord(bytes[pos]) |
|
391 |
pos += 1 |
|
392 |
if (cmd & 0x02): |
|
393 |
offset = offset | (ord(bytes[pos]) << 8) |
|
394 |
pos += 1 |
|
395 |
if (cmd & 0x04): |
|
396 |
offset = offset | (ord(bytes[pos]) << 16) |
|
397 |
pos += 1 |
|
398 |
if (cmd & 0x08): |
|
399 |
offset = offset | (ord(bytes[pos]) << 24) |
|
400 |
pos += 1 |
|
401 |
if (cmd & 0x10): |
|
402 |
length = ord(bytes[pos]) |
|
403 |
pos += 1 |
|
404 |
if (cmd & 0x20): |
|
405 |
length = length | (ord(bytes[pos]) << 8) |
|
406 |
pos += 1 |
|
407 |
if (cmd & 0x40): |
|
408 |
length = length | (ord(bytes[pos]) << 16) |
|
409 |
pos += 1 |
|
410 |
if length == 0: |
|
411 |
length = 65536 |
|
412 |
return (offset, length, pos) |
|
413 |
||
3735.40.5
by John Arbash Meinel
Start adding permutation tests for _groupcompress_py and _groupcompress_pyx |
414 |
|
415 |
def make_delta(source_bytes, target_bytes): |
|
416 |
"""Create a delta from source to target."""
|
|
3735.40.11
by John Arbash Meinel
Implement make_delta and apply_delta. |
417 |
if type(source_bytes) is not str: |
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
418 |
raise TypeError('source is not a str') |
3735.40.11
by John Arbash Meinel
Implement make_delta and apply_delta. |
419 |
if type(target_bytes) is not str: |
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
420 |
raise TypeError('target is not a str') |
3735.40.13
by John Arbash Meinel
Rename EquivalenceTable to LinesDeltaIndex. |
421 |
line_locations = LinesDeltaIndex(osutils.split_lines(source_bytes)) |
3735.40.11
by John Arbash Meinel
Implement make_delta and apply_delta. |
422 |
delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes), |
423 |
bytes_length=len(target_bytes)) |
|
424 |
return ''.join(delta) |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
425 |
|
426 |
||
427 |
def apply_delta(basis, delta): |
|
428 |
"""Apply delta to this object to become new_version_id."""
|
|
3735.40.11
by John Arbash Meinel
Implement make_delta and apply_delta. |
429 |
if type(basis) is not str: |
430 |
raise TypeError('basis is not a str') |
|
431 |
if type(delta) is not str: |
|
432 |
raise TypeError('delta is not a str') |
|
433 |
target_length, pos = decode_base128_int(delta) |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
434 |
lines = [] |
3735.40.11
by John Arbash Meinel
Implement make_delta and apply_delta. |
435 |
len_delta = len(delta) |
436 |
while pos < len_delta: |
|
437 |
cmd = ord(delta[pos]) |
|
438 |
pos += 1 |
|
439 |
if cmd & 0x80: |
|
440 |
offset, length, pos = decode_copy_instruction(delta, cmd, pos) |
|
3735.40.19
by John Arbash Meinel
Implement apply_delta_to_source which doesn't have to malloc another string. |
441 |
last = offset + length |
442 |
if last > len(basis): |
|
443 |
raise ValueError('data would copy bytes past the' |
|
444 |
'end of source') |
|
445 |
lines.append(basis[offset:last]) |
|
3735.40.11
by John Arbash Meinel
Implement make_delta and apply_delta. |
446 |
else: # Insert of 'cmd' bytes |
447 |
if cmd == 0: |
|
448 |
raise ValueError('Command == 0 not supported yet') |
|
449 |
lines.append(delta[pos:pos+cmd]) |
|
450 |
pos += cmd |
|
451 |
bytes = ''.join(lines) |
|
452 |
if len(bytes) != target_length: |
|
453 |
raise ValueError('Delta claimed to be %d long, but ended up' |
|
454 |
' %d long' % (target_length, len(bytes))) |
|
455 |
return bytes |
|
3735.40.19
by John Arbash Meinel
Implement apply_delta_to_source which doesn't have to malloc another string. |
456 |
|
457 |
||
458 |
def apply_delta_to_source(source, delta_start, delta_end): |
|
459 |
"""Extract a delta from source bytes, and apply it."""
|
|
460 |
source_size = len(source) |
|
461 |
if delta_start >= source_size: |
|
462 |
raise ValueError('delta starts after source') |
|
463 |
if delta_end > source_size: |
|
464 |
raise ValueError('delta ends after source') |
|
465 |
if delta_start >= delta_end: |
|
466 |
raise ValueError('delta starts after it ends') |
|
467 |
delta_bytes = source[delta_start:delta_end] |
|
468 |
return apply_delta(source, delta_bytes) |