~bzr-pqm/bzr/bzr.dev

Viewing changes to groupcompress.py

new encoder, allows non-monotonically-increasing sequence matches for more compression.

@@ -39,9 +39,14 @@
     next = lines.next
     print next(), next()
     for header in lines:
-        start, end, count = [int(n) for n in header.split(',')]
-        contents = [next() for i in xrange(count)]
-        result.append((start, end, count, contents))
+        op = header[0]
+        numbers = header[2:]
+        numbers = [int(n) for n in header[2:].split(',')]
+        if op == 'c':
+            result.append((op, numbers[0], numbers[1], None))
+        else:
+            contents = [next() for i in xrange(numbers[0])]
+            result.append((op, None, numbers[0], contents))
     return result
 
 def apply_delta(basis, delta):
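
For reference, the two header forms the new parse loop accepts, and the tuples they become (example values invented; the apply_delta hunk below consumes these tuples in order):

# 'c,<start>,<count>' copies <count> lines from the basis, starting at <start>:
#     'c,0,3'  ->  ('c', 0, 3, None)
# 'i,<count>' inserts the next <count> literal lines carried in the delta itself:
#     'i,2'    ->  ('i', None, 2, ['first inserted line\n', 'second inserted line\n'])
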
@@ -50,13 +55,11 @@
     last_offset = 0
     # eq ranges occur where gaps occur
     # start, end refer to offsets in basis
-    for start, end, count, delta_lines in delta:
-        if last_offset != start: # copy an eq range
-            lines.extend(basis[last_offset:start])
-        lines[start:end] = delta_lines
-        last_offset = end
-    if last_offset != len(basis):
-        lines.extend(basis[last_offset:])
+    for op, start, count, delta_lines in delta:
+        if op == 'c':
+            lines.extend(basis[start:start+count])
+        else:
+            lines.extend(delta_lines)
     trim_encoding_newline(lines)
     return lines
 
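
A small hand-worked sketch of the new apply_delta (the basis and delta are invented, and trim_encoding_newline is ignored). Because each copy op names its own basis offset, a later copy may jump backwards in the basis, which is the non-monotonically-increasing matching the commit message mentions:

basis = ['a\n', 'b\n', 'c\n', 'd\n']
delta = [
    ('c', 2, 2, None),        # copy basis[2:4] -> 'c\n', 'd\n'
    ('i', None, 1, ['x\n']),  # insert one literal line
    ('c', 0, 1, None),        # copy basis[0:1] -> 'a\n' (jumps backwards)
]
# Walking the ops in order, as the loop above does, builds:
# ['c\n', 'd\n', 'x\n', 'a\n']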
 
@@ -69,7 +72,20 @@
 
 
 class GroupCompressor(object):
-    """Produce a serialised group of compressed texts."""
+    """Produce a serialised group of compressed texts.
+
+    It contains code very similar to SequenceMatcher because it performs a
+    similar task. However some key differences apply:
+     - there is no junk, we want a minimal edit not a human readable diff.
+     - we don't filter very common lines (because we don't know where a good
+       range will start, and after the first text we want to be emitting
+       minimal edits only).
+     - we chain the left side, not the right side.
+     - we incrementally update the adjacency matrix as new lines are provided.
+     - we look for matches in all of the left side, so the routine which does
+       the analogous task of find_longest_match does not need to filter on the
+       left side.
+    """
 
     def __init__(self, delta=True):
         """Create a GroupCompressor.
@@ -80,6 +96,8 @@
         self.lines = []
         self.endpoint = 0
         self.input_bytes = 0
+        # line: set(locations it appears at), set(N+1 for N in locations)
+        self.line_locations = {}
 
     def compress(self, key, lines, expected_sha):
         """Compress lines with label key.
@@ -104,26 +122,72 @@
         new_lines = []
         new_lines.append('label: %s\n' % label)
         new_lines.append('sha1: %s\n' % sha1)
-        if 0:
-            delta_seq = diff.difflib.SequenceMatcher(
-                None, self.lines, lines)
-        else:
-            delta_seq = patiencediff.PatienceSequenceMatcher(
-                None, self.lines, lines)
-        diff_hunks = []
-        for op in delta_seq.get_opcodes():
-            if op[0] == 'equal':
-                continue
-            diff_hunks.append((op[1], op[2], op[4]-op[3], lines[op[3]:op[4]]))
-        for start, end, count, new in diff_hunks:
-            new_lines.append('%d,%d,%d\n' % (start, end, count))
-            new_lines.extend(new)
-        self.endpoint += sum(map(len, new_lines))
-        self.lines.extend(new_lines)
+        pos = 0
+        line_locations = self.line_locations
+        accumulator = []
+        copying = False
+        new_len = 0
+        new_start = 0
+        # We either copy a range (while there are reusable lines) or we
+        # insert new lines. To find reusable lines we traverse the new text,
+        # looking each line up in self.line_locations.
+        while pos < len(lines):
+            line = lines[pos]
+            if line not in line_locations:
+                if copying:
+                    # flush the copy
+                    copy_start = min(copy_ends) - copy_len
+                    new_lines.append("c,%d,%d\n" % (copy_start, copy_len))
+                    copying = False
+                    new_start = pos
+                    new_len = 1
+                else:
+                    new_len += 1
+            else:
+                if copying:
+                    locations, next = line_locations[line]
+                    next_locations = locations.intersection(copy_ends)
+                    if len(next_locations):
+                        # range continues
+                        copy_len += 1
+                        copy_ends = set(loc + 1 for loc in next_locations)
+                    else:
+                        # range stops, flush and start a new copy range
+                        copy_start = min(copy_ends) - copy_len
+                        new_lines.append("c,%d,%d\n" % (copy_start, copy_len))
+                        copy_len = 1
+                        copy_ends = next
+                else:
+                    # Flush
+                    if new_len:
+                        new_lines.append("i,%d\n" % new_len)
+                        new_lines.extend(lines[new_start:new_start+new_len])
+                    # setup a copy
+                    copy_len = 1
+                    copy_ends = line_locations[line][1]
+                    copying = True
+            pos += 1
+        if copying:
+            copy_start = min(copy_ends) - copy_len
+            new_lines.append("c,%d,%d\n" % (copy_start, copy_len))
+        elif new_len:
+            new_lines.append("i,%d\n" % new_len)
+            new_lines.extend(lines[new_start:new_start+new_len])
+
+        self.output_lines(new_lines)
         trim_encoding_newline(lines)
         self.input_bytes += sum(map(len, lines))
         return sha1, self.endpoint
 
+    def output_lines(self, new_lines):
+        self.endpoint += sum(map(len, new_lines))
+        offset = len(self.lines)
+        self.lines.extend(new_lines)
+        for pos, line in enumerate(new_lines):
+            indices, next_lines = self.line_locations.setdefault(line,
+                (set(), set()))
+            indices.add(pos + offset)
+            next_lines.add(pos + offset + 1)
+
     def ratio(self):
         """Return the overall compression ratio."""
         return float(self.input_bytes) / float(self.endpoint)
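
A hedged walk-through of the greedy loop above (the group contents are invented and the sha1 header values elided): with the group already holding one text, compressing a second text that shares its first two lines produces one copy op followed by one insert op:

# Suppose self.lines currently holds (offsets 0-3):
#   ['label: a\n', 'sha1: ...\n', 'one\n', 'two\n']
# Compressing ['one\n', 'two\n', 'three\n'] appends roughly:
#   'label: b\n'
#   'sha1: ...\n'
#   'c,2,2\n'    # copy 2 lines starting at group offset 2 ('one\n', 'two\n')
#   'i,1\n'      # insert 1 literal line
#   'three\n'

Unlike the removed SequenceMatcher path, the copy offsets refer to the whole group built so far, and successive copies are free to move backwards through it.
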
@@ -224,7 +288,7 @@
         # double handling for now. Make it work until then.
         bytes = ''.join(lines)
         record = FulltextContentFactory(key, parents, None, bytes)
-        sha1 = self._insert_record_stream([record])
+        sha1 = self._insert_record_stream([record]).next()
         return sha1, len(bytes), None
 
     def _check_add(self, key, lines, random_id, check_content):

@@ -267,6 +331,7 @@
         for record in stream:
            found_sha1, end_point = compressor.compress(record.key,
                 split_lines(record.get_bytes_as('fulltext')), record.sha1)
+            yield found_sha1
 
 class _GCGraphIndex(object):
     """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""