~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/groupcompress.py

Committer: Vincent Ladeuil
Date: 2009-03-27 07:53:57 UTC
mfrom: (3735.40.7 vilajam)
mto: (3735.40.8 vilajam)
mto: This revision was merged to the branch mainline in revision 4280.
Revision ID: v.ladeuil+lp@free.fr-20090327075357-xeqzulo3r406b1jt

Bring in jam's changes

files renamed:
bzrlib/tests/test__groupcompress_pyx.py => bzrlib/tests/test__groupcompress.py

files modified:
bzrlib/_groupcompress_py.py

bzrlib/groupcompress.py

bzrlib/tests/__init__.py

bzrlib/tests/test_groupcompress.py

Show diffs side-by-side

added added

removed removed

bzrlib/groupcompress.py

734

735

def __init__(self):

736

"""Create a GroupCompressor."""

737

# Consider seeding the lines with some sort of GC Start flag, or

738

# putting it as part of the output stream, rather than in the

739

# compressed bytes.

740

737

self.lines = []

738

self._last = None

741

739

self.endpoint = 0

742

740

self.input_bytes = 0

743

741

self.labels_deltas = {}

742

self._block = GroupCompressBlock()

743

744

def extract(self, key):

745

"""Extract a key previously added to the compressor.

746

747

:param key: The key to extract.

748

:return: An iterable over bytes and the sha1.

749

"""

750

delta_details = self.labels_deltas[key]

751

delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]

752

stored_bytes = ''.join(delta_chunks)

753

# TODO: Fix this, we shouldn't really be peeking here

754

entry = self._block._entries[key]

755

if entry.type == 'fulltext':

756

if stored_bytes[0] != 'f':

757

raise ValueError('Index claimed fulltext, but stored bytes'

758

' indicate %s' % (stored_bytes[0],))

759

fulltext_len, offset = decode_base128_int(stored_bytes[1:10])

760

if fulltext_len + 1 + offset != len(stored_bytes):

761

raise ValueError('Index claimed fulltext len, but stored bytes'

762

' claim %s != %s'

763

% (len(stored_bytes),

764

fulltext_len + 1 + offset))

765

bytes = stored_bytes[offset + 1:]

766

else:

767

if entry.type != 'delta':

768

raise ValueError('Unknown entry type: %s' % (entry.type,))

769

# XXX: This is inefficient at best

770

source = ''.join(self.lines)

771

if stored_bytes[0] != 'd':

772

raise ValueError('Entry type claims delta, bytes claim %s'

773

% (stored_bytes[0],))

774

delta_len, offset = decode_base128_int(stored_bytes[1:10])

775

if delta_len + 1 + offset != len(stored_bytes):

776

raise ValueError('Index claimed delta len, but stored bytes'

777

' claim %s != %s'

778

% (len(stored_bytes),

779

delta_len + 1 + offset))

780

bytes = apply_delta(source, stored_bytes[offset + 1:])

781

bytes_sha1 = osutils.sha_string(bytes)

782

if entry.sha1 != bytes_sha1:

783

raise ValueError('Recorded sha1 != measured %s != %s'

784

% (entry.sha1, bytes_sha1))

785

return bytes, entry.sha1

786

787

def pop_last(self):

788

"""Call this if you want to 'revoke' the last compression.

789

790

After this, the data structures will be rolled back, but you cannot do

791

more compression.

792

"""

793

self._delta_index = None

794

del self.lines[self._last[0]:]

795

self.endpoint = self._last[1]

796

self._last = None

744

797

745

798

def ratio(self):

746

799

"""Return the overall compression ratio."""

749

802

750

803

class PythonGroupCompressor(_CommonGroupCompressor):

751

804

752

def __init__(self, delta=True):

805

def __init__(self):

753

806

"""Create a GroupCompressor.

754

807

755

808

:param delta: If False, do not compress records.

756

809

"""

757

810

super(PythonGroupCompressor, self).__init__()

758

self._delta = delta

759

self.line_offsets = []

760

811

self.line_locations = EquivalenceTable([])

761

812

self.lines = self.line_locations.lines

762

813

self._present_prefixes = set()

763

814

764

def get_matching_blocks(self, lines, soft=False):

765

"""Return the ranges in lines which match self.lines.

766

767

:param lines: lines to compress

768

:return: A list of (old_start, new_start, length) tuples which reflect

769

a region in self.lines that is present in lines. The last element

770

of the list is always (old_len, new_len, 0) to provide a end point

771

for generating instructions from the matching blocks list.

772

"""

773

result = []

774

pos = 0

775

line_locations = self.line_locations

776

line_locations.set_right_lines(lines)

777

# We either copy a range (while there are reusable lines) or we

778

# insert new lines. To find reusable lines we traverse

779

locations = None

780

max_pos = len(lines)

781

result_append = result.append

782

min_match_bytes = 10

783

if soft:

784

min_match_bytes = 200

785

while pos < max_pos:

786

block, pos, locations = _get_longest_match(line_locations, pos,

787

max_pos, locations)

788

if block is not None:

789

# Check to see if we are matching fewer than 5 characters,

790

# which is turned into a simple 'insert', rather than a copy

791

# If we have more than 5 lines, we definitely have more than 5

792

# chars

793

if block[-1] < min_match_bytes:

794

# This block may be a 'short' block, check

795

old_start, new_start, range_len = block

796

matched_bytes = sum(map(len,

797

lines[new_start:new_start + range_len]))

798

if matched_bytes < min_match_bytes:

799

block = None

800

if block is not None:

801

result_append(block)

802

result_append((len(self.lines), len(lines), 0))

803

return result

804

805

815

# FIXME: implement nostore_sha

806

def compress(self, key, lines, expected_sha, nostore_sha=False, soft=False):

816

def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):

807

817

"""Compress lines with label key.

808

818

809

819

:param key: A key tuple. It is stored in the output

810

820

for identification of the text during decompression. If the last

811

821

element is 'None' it is replaced with the sha1 of the text -

812

822

e.g. sha1:xxxxxxx.

813

:param lines: The lines to be compressed. Must be split

814

on \n, with the \n preserved.'

823

:param bytes: The bytes to be compressed

815

824

:param expected_sha: If non-None, the sha the lines are believed to

816

825

have. During compression the sha is calculated; a mismatch will

817

826

cause an error.

827

:param nostore_sha: If the computed sha1 sum matches, we will raise

828

ExistingContent rather than adding the text.

818

829

:param soft: Do a 'soft' compression. This means that we require larger

819

830

ranges to match to be considered for a copy command.

820

831

:return: The sha1 of lines, and the number of bytes accumulated in

821

832

the group output so far.

833

:seealso VersionedFiles.add_lines:

822

834

"""

823

lines = osutils.split_lines(lines)

824

sha1 = osutils.sha_strings(lines)

835

if not bytes: # empty, like a dir entry, etc

836

if nostore_sha == _null_sha1:

837

raise errors.ExistingContent()

838

self._block.add_entry(key, type='empty',

839

sha1=None, start=0,

840

length=0)

841

return _null_sha1, 0, 0, 'fulltext', 0

842

bytes_length = len(bytes)

843

new_lines = osutils.split_lines(bytes)

844

sha1 = osutils.sha_string(bytes)

845

if sha1 == nostore_sha:

846

raise errors.ExistingContent()

825

847

if key[-1] is None:

826

848

key = key[:-1] + ('sha1:' + sha1,)

827

label = '\x00'.join(key)

828

## new_lines = []

829

new_lines = ['label: %s\n' % label,

830

'sha1: %s\n' % sha1,

831

]

832

## index_lines = []

833

index_lines = [False, False]

834

# setup good encoding for trailing \n support.

835

if not lines or lines[-1].endswith('\n'):

836

lines.append('\n')

849

out_lines, index_lines = self.line_locations.make_delta(new_lines,

850

soft=soft)

851

delta_length = sum(map(len, out_lines))

852

if delta_length * 2 > bytes_length:

853

# The delta is longer than the fulltext, insert a fulltext

854

type = 'fulltext'

855

out_lines = ['f', encode_base128_int(bytes_length)]

856

out_lines.extend(new_lines)

857

index_lines = [False, False]

858

index_lines.extend([True] * len(new_lines))

859

out_length = len(out_lines[1]) + bytes_length + 1

837

860

else:

838

lines[-1] = lines[-1] + '\n'

839

pos = 0

840

range_len = 0

841

range_start = 0

842

flush_range = self.flush_range

843

copy_ends = None

844

blocks = self.get_matching_blocks(lines, soft=soft)

845

current_pos = 0

846

#copies_without_insertion = []

847

# We either copy a range (while there are reusable lines) or we

848

# insert new lines. To find reusable lines we traverse

849

for old_start, new_start, range_len in blocks:

850

if new_start != current_pos:

851

# if copies_without_insertion:

852

# self.flush_multi(copies_without_insertion,

853

# lines, new_lines, index_lines)

854

# copies_without_insertion = []

855

# non-matching region

856

flush_range(current_pos, None, new_start - current_pos,

857

lines, new_lines, index_lines)

858

current_pos = new_start + range_len

859

if not range_len:

860

continue

861

# copies_without_insertion.append((new_start, old_start, range_len))

862

flush_range(new_start, old_start, range_len,

863

lines, new_lines, index_lines)

864

# if copies_without_insertion:

865

# self.flush_multi(copies_without_insertion,

866

# lines, new_lines, index_lines)

867

# copies_without_insertion = []

861

# this is a worthy delta, output it

862

type = 'delta'

863

out_lines[0] = 'd'

864

# Update the delta_length to include those two encoded integers

865

out_lines[1] = encode_base128_int(delta_length)

866

out_length = len(out_lines[3]) + 1 + delta_length

867

self._block.add_entry(key, type=type, sha1=sha1,

868

start=self.endpoint, length=out_length)

868

869

start = self.endpoint # Keep it

869

870

delta_start = (self.endpoint, len(self.lines))

870

self.output_lines(new_lines, index_lines)

871

trim_encoding_newline(lines)

872

length = sum(map(len, lines))

873

self.input_bytes += length

871

self.output_lines(out_lines, index_lines)

872

self.input_bytes += bytes_length

874

873

delta_end = (self.endpoint, len(self.lines))

875

874

self.labels_deltas[key] = (delta_start, delta_end)

876

875

# FIXME: lot of guessing below

877

return sha1, start, self.endpoint, 'delta', length

878

879

def extract(self, key):

880

"""Extract a key previously added to the compressor.

881

882

:param key: The key to extract.

883

:return: An iterable over bytes and the sha1.

884

"""

885

delta_details = self.labels_deltas[key]

886

delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]

887

label, sha1, delta = parse(delta_lines)

888

## delta = parse(delta_lines)

889

if label != key:

890

raise AssertionError("wrong key: %r, wanted %r" % (label, key))

891

# Perhaps we want to keep the line offsets too in memory at least?

892

chunks = apply_delta(''.join(self.lines), delta)

893

sha1 = osutils.sha_strings(chunks)

894

return chunks, sha1

895

896

def flush_multi(self, instructions, lines, new_lines, index_lines):

897

"""Flush a bunch of different ranges out.

898

899

This should only be called with data that are "pure" copies.

900

"""

901

flush_range = self.flush_range

902

if len(instructions) > 2:

903

# This is the number of lines to be copied

904

total_copy_range = sum(i[2] for i in instructions)

905

if len(instructions) > 0.5 * total_copy_range:

906

# We are copying N lines, but taking more than N/2

907

# copy instructions to do so. We will go ahead and expand this

908

# text so that other code is able to match against it

909

flush_range(instructions[0][0], None, total_copy_range,

910

lines, new_lines, index_lines)

911

return

912

for ns, os, rl in instructions:

913

flush_range(ns, os, rl, lines, new_lines, index_lines)

914

915

def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):

916

insert_instruction = "i,%d\n" % range_len

917

if copy_start is not None:

918

# range stops, flush and start a new copy range

919

stop_byte = self.line_offsets[copy_start + range_len - 1]

920

if copy_start == 0:

921

start_byte = 0

922

else:

923

start_byte = self.line_offsets[copy_start - 1]

924

bytes = stop_byte - start_byte

925

copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)

926

if (bytes + len(insert_instruction) >

927

len(copy_control_instruction)):

928

new_lines.append(copy_control_instruction)

929

index_lines.append(False)

930

return

931

# not copying, or inserting is shorter than copying, so insert.

932

new_lines.append(insert_instruction)

933

new_lines.extend(lines[range_start:range_start+range_len])

934

index_lines.append(False)

935

index_lines.extend([copy_start is None]*range_len)

876

return sha1, start, self.endpoint, 'delta', out_length

936

877

937

878

def flush(self):

938

879

# FIXME: ugly hack to masquerade ourself as the pyrex version

939

class content(object):

940

941

def __init__(self, s):

942

self.s = s

943

944

def to_bytes(self):

945

return self.s

946

947

return content(zlib.compress(''.join(self.lines)))

880

self._block.set_content(''.join(self.lines))

881

return self._block

948

882

949

883

def output_lines(self, new_lines, index_lines):

950

884

"""Output some lines.

957

891

# if val and new_lines[idx] == '\n']

958

892

# if indexed_newlines:

959

893

# import pdb; pdb.set_trace()

894

self._last = (len(self.lines), self.endpoint)

960

895

endpoint = self.endpoint

961

896

self.line_locations.extend_lines(new_lines, index_lines)

962

897

for line in new_lines:

963

898

endpoint += len(line)

964

self.line_offsets.append(endpoint)

965

899

self.endpoint = endpoint

966

900

967

901

982

916

"""

983

917

984

918

def __init__(self):

985

super(PythonGroupCompressor, self).__init__()

919

super(PyrexGroupCompressor, self).__init__()

986

920

self.num_keys = 0

987

self._last = None

988

921

self._delta_index = DeltaIndex()

989

self._block = GroupCompressBlock()

990

922

991

923

def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):

992

924

"""Compress lines with label key.

1133

1065

endpoint += sum(map(len, new_chunks))

1134

1066

self.endpoint = endpoint

1135

1067

1136

def pop_last(self):

1137

"""Call this if you want to 'revoke' the last compression.

1138

1139

After this, the data structures will be rolled back, but you cannot do

1140

more compression.

1141

"""

1142

self._delta_index = None

1143

del self.lines[self._last[0]:]

1144

self.endpoint = self._last[1]

1145

self._last = None

1146

1147

1068

1148

1069

def make_pack_factory(graph, delta, keylength):

1149

1070

"""Create a factory for creating a pack based groupcompress.

1587

1508

result[record.key] = record.sha1

1588

1509

else:

1589

1510

if record.storage_kind != 'absent':

1590

result[record.key] = olsutils.sha_string(record.get_bytes_as(

1591

'fulltext'))

1511

result[record.key] = osutils.sha_string(

1512

record.get_bytes_as('fulltext'))

1592

1513

return result

1593

1514

1594

1515

def insert_record_stream(self, stream):

1985

1906

return node[0], start, stop, basis_end, delta_end

1986

1907

1987

1908

1909

from bzrlib._groupcompress_py import (

1910

apply_delta,

1911

encode_copy_instruction,

1912

EquivalenceTable,

1913

)

1988

1914

try:

1989

1915

from bzrlib._groupcompress_pyx import (

1990

1916

apply_delta,

1991

1917

DeltaIndex,

1992

1918

)

1993

GroupCompressor = PyrexCompressor

1919

GroupCompressor = PyrexGroupCompressor

1994

1920

except ImportError:

1995

from bzrlib._groupcompress_py import (

1996

apply_delta,

1997

EquivalenceTable,

1998

_get_longest_match,

1999

trim_encoding_newline,

2000

)

2001

1921

GroupCompressor = PythonGroupCompressor

2002

1922

Older »