735
735
def __init__(self):
736
736
"""Create a GroupCompressor."""
737
# Consider seeding the lines with some sort of GC Start flag, or
738
# putting it as part of the output stream, rather than in the
741
739
self.endpoint = 0
742
740
self.input_bytes = 0
743
741
self.labels_deltas = {}
742
self._block = GroupCompressBlock()
744
def extract(self, key):
745
"""Extract a key previously added to the compressor.
747
:param key: The key to extract.
748
:return: An iterable over bytes and the sha1.
750
delta_details = self.labels_deltas[key]
751
delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
752
stored_bytes = ''.join(delta_chunks)
753
# TODO: Fix this, we shouldn't really be peeking here
754
entry = self._block._entries[key]
755
if entry.type == 'fulltext':
756
if stored_bytes[0] != 'f':
757
raise ValueError('Index claimed fulltext, but stored bytes'
758
' indicate %s' % (stored_bytes[0],))
759
fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
760
if fulltext_len + 1 + offset != len(stored_bytes):
761
raise ValueError('Index claimed fulltext len, but stored bytes'
763
% (len(stored_bytes),
764
fulltext_len + 1 + offset))
765
bytes = stored_bytes[offset + 1:]
767
if entry.type != 'delta':
768
raise ValueError('Unknown entry type: %s' % (entry.type,))
769
# XXX: This is inefficient at best
770
source = ''.join(self.lines)
771
if stored_bytes[0] != 'd':
772
raise ValueError('Entry type claims delta, bytes claim %s'
773
% (stored_bytes[0],))
774
delta_len, offset = decode_base128_int(stored_bytes[1:10])
775
if delta_len + 1 + offset != len(stored_bytes):
776
raise ValueError('Index claimed delta len, but stored bytes'
778
% (len(stored_bytes),
779
delta_len + 1 + offset))
780
bytes = apply_delta(source, stored_bytes[offset + 1:])
781
bytes_sha1 = osutils.sha_string(bytes)
782
if entry.sha1 != bytes_sha1:
783
raise ValueError('Recorded sha1 != measured %s != %s'
784
% (entry.sha1, bytes_sha1))
785
return bytes, entry.sha1
788
"""Call this if you want to 'revoke' the last compression.
790
After this, the data structures will be rolled back, but you cannot do
793
self._delta_index = None
794
del self.lines[self._last[0]:]
795
self.endpoint = self._last[1]
746
799
"""Return the overall compression ratio."""
750
803
class PythonGroupCompressor(_CommonGroupCompressor):
752
def __init__(self, delta=True):
753
806
"""Create a GroupCompressor.
755
808
:param delta: If False, do not compress records.
757
810
super(PythonGroupCompressor, self).__init__()
759
self.line_offsets = []
760
811
self.line_locations = EquivalenceTable([])
761
812
self.lines = self.line_locations.lines
762
813
self._present_prefixes = set()
764
def get_matching_blocks(self, lines, soft=False):
765
"""Return the ranges in lines which match self.lines.
767
:param lines: lines to compress
768
:return: A list of (old_start, new_start, length) tuples which reflect
769
a region in self.lines that is present in lines. The last element
770
of the list is always (old_len, new_len, 0) to provide a end point
771
for generating instructions from the matching blocks list.
775
line_locations = self.line_locations
776
line_locations.set_right_lines(lines)
777
# We either copy a range (while there are reusable lines) or we
778
# insert new lines. To find reusable lines we traverse
781
result_append = result.append
784
min_match_bytes = 200
786
block, pos, locations = _get_longest_match(line_locations, pos,
788
if block is not None:
789
# Check to see if we are matching fewer than 5 characters,
790
# which is turned into a simple 'insert', rather than a copy
791
# If we have more than 5 lines, we definitely have more than 5
793
if block[-1] < min_match_bytes:
794
# This block may be a 'short' block, check
795
old_start, new_start, range_len = block
796
matched_bytes = sum(map(len,
797
lines[new_start:new_start + range_len]))
798
if matched_bytes < min_match_bytes:
800
if block is not None:
802
result_append((len(self.lines), len(lines), 0))
805
815
# FIXME: implement nostore_sha
806
def compress(self, key, lines, expected_sha, nostore_sha=False, soft=False):
816
def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
807
817
"""Compress lines with label key.
809
819
:param key: A key tuple. It is stored in the output
810
820
for identification of the text during decompression. If the last
811
821
element is 'None' it is replaced with the sha1 of the text -
812
822
e.g. sha1:xxxxxxx.
813
:param lines: The lines to be compressed. Must be split
814
on \n, with the \n preserved.'
823
:param bytes: The bytes to be compressed
815
824
:param expected_sha: If non-None, the sha the lines are believed to
816
825
have. During compression the sha is calculated; a mismatch will
827
:param nostore_sha: If the computed sha1 sum matches, we will raise
828
ExistingContent rather than adding the text.
818
829
:param soft: Do a 'soft' compression. This means that we require larger
819
830
ranges to match to be considered for a copy command.
820
831
:return: The sha1 of lines, and the number of bytes accumulated in
821
832
the group output so far.
833
:seealso VersionedFiles.add_lines:
823
lines = osutils.split_lines(lines)
824
sha1 = osutils.sha_strings(lines)
835
if not bytes: # empty, like a dir entry, etc
836
if nostore_sha == _null_sha1:
837
raise errors.ExistingContent()
838
self._block.add_entry(key, type='empty',
841
return _null_sha1, 0, 0, 'fulltext', 0
842
bytes_length = len(bytes)
843
new_lines = osutils.split_lines(bytes)
844
sha1 = osutils.sha_string(bytes)
845
if sha1 == nostore_sha:
846
raise errors.ExistingContent()
825
847
if key[-1] is None:
826
848
key = key[:-1] + ('sha1:' + sha1,)
827
label = '\x00'.join(key)
829
new_lines = ['label: %s\n' % label,
833
index_lines = [False, False]
834
# setup good encoding for trailing \n support.
835
if not lines or lines[-1].endswith('\n'):
849
out_lines, index_lines = self.line_locations.make_delta(new_lines,
851
delta_length = sum(map(len, out_lines))
852
if delta_length * 2 > bytes_length:
853
# The delta is longer than the fulltext, insert a fulltext
855
out_lines = ['f', encode_base128_int(bytes_length)]
856
out_lines.extend(new_lines)
857
index_lines = [False, False]
858
index_lines.extend([True] * len(new_lines))
859
out_length = len(out_lines[1]) + bytes_length + 1
838
lines[-1] = lines[-1] + '\n'
842
flush_range = self.flush_range
844
blocks = self.get_matching_blocks(lines, soft=soft)
846
#copies_without_insertion = []
847
# We either copy a range (while there are reusable lines) or we
848
# insert new lines. To find reusable lines we traverse
849
for old_start, new_start, range_len in blocks:
850
if new_start != current_pos:
851
# if copies_without_insertion:
852
# self.flush_multi(copies_without_insertion,
853
# lines, new_lines, index_lines)
854
# copies_without_insertion = []
855
# non-matching region
856
flush_range(current_pos, None, new_start - current_pos,
857
lines, new_lines, index_lines)
858
current_pos = new_start + range_len
861
# copies_without_insertion.append((new_start, old_start, range_len))
862
flush_range(new_start, old_start, range_len,
863
lines, new_lines, index_lines)
864
# if copies_without_insertion:
865
# self.flush_multi(copies_without_insertion,
866
# lines, new_lines, index_lines)
867
# copies_without_insertion = []
861
# this is a worthy delta, output it
864
# Update the delta_length to include those two encoded integers
865
out_lines[1] = encode_base128_int(delta_length)
866
out_length = len(out_lines[3]) + 1 + delta_length
867
self._block.add_entry(key, type=type, sha1=sha1,
868
start=self.endpoint, length=out_length)
868
869
start = self.endpoint # Keep it
869
870
delta_start = (self.endpoint, len(self.lines))
870
self.output_lines(new_lines, index_lines)
871
trim_encoding_newline(lines)
872
length = sum(map(len, lines))
873
self.input_bytes += length
871
self.output_lines(out_lines, index_lines)
872
self.input_bytes += bytes_length
874
873
delta_end = (self.endpoint, len(self.lines))
875
874
self.labels_deltas[key] = (delta_start, delta_end)
876
875
# FIXME: lot of guessing below
877
return sha1, start, self.endpoint, 'delta', length
879
def extract(self, key):
880
"""Extract a key previously added to the compressor.
882
:param key: The key to extract.
883
:return: An iterable over bytes and the sha1.
885
delta_details = self.labels_deltas[key]
886
delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
887
label, sha1, delta = parse(delta_lines)
888
## delta = parse(delta_lines)
890
raise AssertionError("wrong key: %r, wanted %r" % (label, key))
891
# Perhaps we want to keep the line offsets too in memory at least?
892
chunks = apply_delta(''.join(self.lines), delta)
893
sha1 = osutils.sha_strings(chunks)
896
def flush_multi(self, instructions, lines, new_lines, index_lines):
897
"""Flush a bunch of different ranges out.
899
This should only be called with data that are "pure" copies.
901
flush_range = self.flush_range
902
if len(instructions) > 2:
903
# This is the number of lines to be copied
904
total_copy_range = sum(i[2] for i in instructions)
905
if len(instructions) > 0.5 * total_copy_range:
906
# We are copying N lines, but taking more than N/2
907
# copy instructions to do so. We will go ahead and expand this
908
# text so that other code is able to match against it
909
flush_range(instructions[0][0], None, total_copy_range,
910
lines, new_lines, index_lines)
912
for ns, os, rl in instructions:
913
flush_range(ns, os, rl, lines, new_lines, index_lines)
915
def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
916
insert_instruction = "i,%d\n" % range_len
917
if copy_start is not None:
918
# range stops, flush and start a new copy range
919
stop_byte = self.line_offsets[copy_start + range_len - 1]
923
start_byte = self.line_offsets[copy_start - 1]
924
bytes = stop_byte - start_byte
925
copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
926
if (bytes + len(insert_instruction) >
927
len(copy_control_instruction)):
928
new_lines.append(copy_control_instruction)
929
index_lines.append(False)
931
# not copying, or inserting is shorter than copying, so insert.
932
new_lines.append(insert_instruction)
933
new_lines.extend(lines[range_start:range_start+range_len])
934
index_lines.append(False)
935
index_lines.extend([copy_start is None]*range_len)
876
return sha1, start, self.endpoint, 'delta', out_length
938
879
# FIXME: ugly hack to masquerade ourself as the pyrex version
939
class content(object):
941
def __init__(self, s):
947
return content(zlib.compress(''.join(self.lines)))
880
self._block.set_content(''.join(self.lines))
949
883
def output_lines(self, new_lines, index_lines):
950
884
"""Output some lines.