# groupcompress, a bzr plugin providing new compression logic.
# Copyright (C) 2008 Canonical Limited.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#

"""Core compression logic for compressing streams of related files."""

from itertools import izip
from cStringIO import StringIO
import zlib

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    osutils,
    pack,
    patiencediff,
    progress,
    trace,
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    contains_linebreaks,
    sha_string,
    sha_strings,
    split_lines,
    )
from bzrlib.btree_index import BTreeBuilder
from bzrlib.lru_cache import LRUSizeCache
from bzrlib.plugins.groupcompress import equivalence_table
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    ChunkedContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )


def parse(line_list):
    result = []
    lines = iter(line_list)
    next = lines.next
    label_line = lines.next()
    sha1_line = lines.next()
    if (not label_line.startswith('label: ') or
        not sha1_line.startswith('sha1: ')):
        raise AssertionError("bad text record %r" % lines)
    label = tuple(label_line[7:-1].split('\x00'))
    sha1 = sha1_line[6:-1]
    for header in lines:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            result.append((op, numbers[0], numbers[1], None))
        else:
            contents = [next() for i in xrange(numbers[0])]
            result.append((op, None, numbers[0], contents))
    return label, sha1, result
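
# Illustrative sketch (not from the original source): a text record parsed
# above looks roughly like the following. 'c,start,byte_count' copies bytes
# from the group built so far, 'i,line_count' inserts the next line_count
# lines literally; the copy offsets below are arbitrary, for illustration.
#
#   record = [
#       'label: file-id\x00rev-id\n',
#       'sha1: 1234567890123456789012345678901234567890\n',
#       'i,1\n',
#       'hello world\n',
#       'c,0,12\n',
#       ]
#   label, sha1, delta = parse(record)
#   # label == ('file-id', 'rev-id')
#   # delta == [('i', None, 1, ['hello world\n']), ('c', 0, 12, None)]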


def apply_delta(basis, delta):
    """Apply delta to this object to become new_version_id."""
    lines = []
    last_offset = 0
    # eq ranges occur where gaps occur
    # start, end refer to offsets in basis
    for op, start, count, delta_lines in delta:
        if op == 'c':
            lines.append(basis[start:start+count])
        else:
            lines.extend(delta_lines)
    trim_encoding_newline(lines)
    return lines


def trim_encoding_newline(lines):
    if lines[-1] == '\n':
        del lines[-1]
    else:
        lines[-1] = lines[-1][:-1]
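
# Illustrative sketch (not in the original source): a copy instruction slices
# bytes out of the basis string, an insert carries its own lines, and
# trim_encoding_newline() removes the artificial trailing newline added by
# GroupCompressor.compress().
#
#   basis = 'one\ntwo\n\n'        # toy basis: some text plus the extra '\n'
#   delta = [('c', 0, 9, None)]   # copy all nine bytes of the basis
#   apply_delta(basis, delta)     # -> ['one\ntwo\n']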


def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into gc-optimal order.

    gc-optimal is defined (currently) as reverse-topological order, grouped by
    the key prefix.

    :return: A sorted-list of keys
    """
    # gc-optimal ordering is approximately reverse topological,
    # properly grouped by file-id.
    per_prefix_map = {'': []}
    present_keys = []
    for item in parent_map.iteritems():
        key = item[0]
        if isinstance(key, str) or len(key) == 1:
            per_prefix_map[''].append(item)
        else:
            try:
                per_prefix_map[key[0]].append(item)
            except KeyError:
                per_prefix_map[key[0]] = [item]

    for prefix in sorted(per_prefix_map):
        present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
    return present_keys
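
# Illustrative sketch (not in the original source): keys are grouped by their
# prefix (typically the file-id) and each group is emitted in reverse
# topological order, so texts that delta well against each other end up
# adjacent.
#
#   parent_map = {
#       ('f1', 'rev1'): (),
#       ('f1', 'rev2'): (('f1', 'rev1'),),
#       ('f2', 'rev1'): (),
#   }
#   sort_gc_optimal(parent_map)
#   # -> [('f1', 'rev2'), ('f1', 'rev1'), ('f2', 'rev1')]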


class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because of having a similar
    task. However some key differences apply:
     - there is no junk, we want a minimal edit not a human readable diff.
     - we don't filter very common lines (because we don't know where a good
       range will start, and after the first text we want to be emitting
       minimal edits only).
     - we chain the left side, not the right side
     - we incrementally update the adjacency matrix as new lines are provided.
     - we look for matches in all of the left side, so the routine which does
       the analogous task of find_longest_match does not need to filter on the
       left side.
    """

    _equivalence_table_class = equivalence_table.EquivalenceTable

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        self._delta = delta
        self.line_offsets = []
        self.endpoint = 0
        self.input_bytes = 0
        self.line_locations = self._equivalence_table_class([])
        self.lines = self.line_locations.lines
        self.labels_deltas = {}

    def get_matching_blocks(self, lines):
        """Return the ranges in lines which match self.lines.

        :param lines: lines to compress
        :return: A list of (old_start, new_start, length) tuples which reflect
            a region in self.lines that is present in lines.  The last element
            of the list is always (old_len, new_len, 0) to provide an end point
            for generating instructions from the matching blocks list.
        """
        result = []
        pos = 0
        line_locations = self.line_locations
        line_locations.set_right_lines(lines)
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we traverse the new lines,
        # repeatedly asking for the longest match against the lines already
        # in the group.
        locations = None
        max_pos = len(lines)
        result_append = result.append
        while pos < max_pos:
            block, pos, locations = _get_longest_match(line_locations, pos,
                                                       max_pos, locations)
            if block is not None:
                result_append(block)
        result_append((len(self.lines), len(lines), 0))
        return result
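
    # Illustrative note (not from the original source): if the lines already
    # indexed in the group were ['a\n', 'b\n', 'c\n'] and the new text is
    # ['a\n', 'b\n', 'x\n'], the result would be
    #   [(0, 0, 2), (3, 3, 0)]
    # i.e. the first two new lines can be copied from offset 0 of the group,
    # and the final (old_len, new_len, 0) sentinel closes the list.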

    def compress(self, key, lines, expected_sha):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is 'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param lines: The lines to be compressed. Must be split
            on \n, with the \n preserved.
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        sha1 = sha_strings(lines)
        if key[-1] is None:
            key = key[:-1] + ('sha1:' + sha1,)
        label = '\x00'.join(key)
        # setup good encoding for trailing \n support.
        if not lines or lines[-1].endswith('\n'):
            lines.append('\n')
        else:
            lines[-1] = lines[-1] + '\n'
        new_lines = []
        new_lines.append('label: %s\n' % label)
        new_lines.append('sha1: %s\n' % sha1)
        index_lines = [False, False]
        pos = 0
        range_len = 0
        range_start = 0
        flush_range = self.flush_range
        copy_ends = None
        blocks = self.get_matching_blocks(lines)
        current_pos = 0
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. The matching blocks tell us where the reusable
        # ranges are.
        for old_start, new_start, range_len in blocks:
            if new_start != current_pos:
                # non-matching region
                flush_range(current_pos, None, new_start - current_pos,
                    lines, new_lines, index_lines)
            current_pos = new_start + range_len
            if not range_len:
                continue
            flush_range(new_start, old_start, range_len, lines,
                new_lines, index_lines)
        delta_start = (self.endpoint, len(self.lines))
        self.output_lines(new_lines, index_lines)
        trim_encoding_newline(lines)
        self.input_bytes += sum(map(len, lines))
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        return sha1, self.endpoint

    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over bytes and the sha1.
        """
        delta_details = self.labels_deltas[key]
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
        label, sha1, delta = parse(delta_lines)
        if label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        # Perhaps we want to keep the line offsets too in memory at least?
        lines = apply_delta(''.join(self.lines), delta)
        sha1 = sha_strings(lines)
        return lines, sha1

    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
        insert_instruction = "i,%d\n" % range_len
        if copy_start is not None:
            # range stops, flush and start a new copy range
            stop_byte = self.line_offsets[copy_start + range_len - 1]
            if copy_start == 0:
                start_byte = 0
            else:
                start_byte = self.line_offsets[copy_start - 1]
            bytes = stop_byte - start_byte
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
            if (bytes + len(insert_instruction) >
                len(copy_control_instruction)):
                new_lines.append(copy_control_instruction)
                index_lines.append(False)
                return
        # not copying, or inserting is shorter than copying, so insert.
        new_lines.append(insert_instruction)
        new_lines.extend(lines[range_start:range_start+range_len])
        index_lines.append(False)
        index_lines.extend([copy_start is None]*range_len)
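
    # Worked example (added for clarity, not in the original source): suppose
    # a matching range covers 100 bytes of already-output text starting at
    # byte 2048.  Re-inserting those lines would cost len("i,3\n") + 100
    # bytes, while the copy instruction "c,2048,100\n" costs 11 bytes, so the
    # copy wins and is emitted.  For a 4-byte match, 4 + len("i,1\n") = 8 is
    # not greater than len("c,2048,4\n") = 9, so the lines are inserted
    # literally instead.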

    def output_lines(self, new_lines, index_lines):
        """Output some lines.

        :param new_lines: The lines to output.
        :param index_lines: A boolean flag for each line - when True, index
            that line.
        """
        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
        #                          if val and new_lines[idx] == '\n']
        # if indexed_newlines:
        #     import pdb; pdb.set_trace()
        endpoint = self.endpoint
        self.line_locations.extend_lines(new_lines, index_lines)
        for line in new_lines:
            endpoint += len(line)
            self.line_offsets.append(endpoint)
        self.endpoint = endpoint

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)
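
# Usage sketch (added for illustration; not part of the original module).  The
# compressor is fed whole texts and accumulates a single delta-compressed
# group in self.lines:
#
#   compressor = GroupCompressor(delta=True)
#   text_a = ['common line\n', 'only in a\n']
#   text_b = ['common line\n', 'only in b\n']
#   sha_a, end_a = compressor.compress(('key-a',), list(text_a), None)
#   sha_b, end_b = compressor.compress(('key-b',), list(text_b), None)
#   lines, sha = compressor.extract(('key-b',))   # round-trips text_b's content
#   compressor.ratio()   # input bytes / group bytes so far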


def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests, it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long should keys be.
    """
    def factory(transport):
        parents = graph or delta
        ref_length = 0
        if graph:
            ref_length += 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory


def cleanup_pack_group(versioned_files):
    versioned_files.writer.end()
    versioned_files.stream.close()
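
# Usage sketch (illustrative only, not from the original source; the memory
# transport is an assumption about the test environment this factory targets):
#
#   from bzrlib.transport import get_transport
#   factory = make_pack_factory(graph=True, delta=True, keylength=1)
#   vf = factory(get_transport('memory:///'))
#   vf.add_lines(('key1',), (), ['some text\n'])
#   cleanup_pack_group(vf)   # end the pack writer and close the stream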


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint the
            add routine may error or may succeed - but you will be unable to
            read the data back accurately. (Checking the lines have been split
            correctly is expensive and extremely unlikely to catch bugs so it
            is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        length = sum(map(len, lines))
        record = ChunkedContentFactory(key, parents, None, lines)
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
        return sha1, length, None

    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, chunks, key, None, head_cache))
        return parent_cache[key]

    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')

    def _check_add(self, key, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        version_id = key[-1]
        if version_id is not None:
            if contains_whitespace(version_id):
                raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        result = {}
        sources = [self._index]
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        if self._unadded_refs:
            for key in missing:
                if key in self._unadded_refs:
                    result[key] = self._unadded_refs[key]
        return result

    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        keys = set(keys)
        if not keys:
            return
        if not self._index.has_graph:
            # Cannot topological order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        local_keys = frozenset(keys).intersection(set(self._unadded_refs))
        locations.update((key, None) for key in local_keys)
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            for key in local_keys:
                parent_map[key] = self._unadded_refs[key]
            present_keys = topo_sort(parent_map)
            # Now group by source:
        elif ordering == 'gc-optimal':
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            for key in local_keys:
                parent_map[key] = self._unadded_refs[key]
            # XXX: This only optimizes for the target ordering. We may need to
            #      balance that with the time it takes to extract ordering, by
            #      somehow grouping based on locations[key][0:3]
            present_keys = sort_gc_optimal(parent_map)
        else:
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
            # Otherwise we thrash the _group_cache and destroy performance
            def get_group(key):
                # This is the group the bytes are stored in, followed by the
                # location in the group
                return locations[key][0]
            present_keys = sorted(locations.iterkeys(), key=get_group)
            # We don't have an ordering for keys in the in-memory object, but
            # let's process the in-memory ones first.
            present_keys = list(local_keys) + present_keys
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                lines, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                read_memo = index_memo[0:3]
                # get the group:
                try:
                    plain = self._group_cache[read_memo]
                except KeyError:
                    # read the group
                    zdata = self._access.get_raw_records([read_memo]).next()
                    # decompress - whole thing - this is not a bug, as it
                    # permits caching. We might want to store the partially
                    # decompressed group and decompress object, so that recent
                    # texts are not penalised by big groups.
                    decomp = zlib.decompressobj()
                    plain = decomp.decompress(zdata) #, index_memo[4])
                    self._group_cache[read_memo] = plain
                # cheapo debugging:
                # print len(zdata), len(plain)
                # parse - requires split_lines, better to have byte offsets
                # here (but not by much - we only split the region for the
                # recipe, and we often want to end up with lines anyway).
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
                label, sha1, delta = parse(delta_lines)
                if label != key:
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                lines = apply_delta(plain, delta)
            yield ChunkedContentFactory(key, parents, sha1, lines)
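
    # Usage note (illustrative, not in the original source): besides the
    # 'unordered' and 'topological' orderings named in the docstring, this
    # implementation also accepts 'gc-optimal', which uses sort_gc_optimal()
    # above to group keys by prefix in reverse-topological order:
    #
    #   for factory in vf.get_record_stream(keys, 'gc-optimal', True):
    #       chunks = factory.get_bytes_as('chunked')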

    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result

    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        for _ in self._insert_record_stream(stream):
            pass

    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind in ('chunked', 'fulltext'):
                lines = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            else:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
                lines = osutils.split_lines(bytes)
            found_sha1, end_point = self._compressor.compress(record.key,
                lines, record.sha1)
            if record.key[-1] is None:
                key = record.key[:-1] + ('sha1:' + found_sha1,)
            else:
                key = record.key
            self._unadded_refs[key] = record.parents
            yield found_sha1
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            if basis_end > 1024 * 1024 * 20:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}

    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)

    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback to check whether the object should answer
            queries.
        :param parents: If True, record knits parents, if not do not record
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        :param is_locked: A callback, returns True if the index is locked and
            thus usable.
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked

    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the Immutable GraphIndex
        backing the KnitGraphIndex, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples:
                         (key, options, access_memo, parents).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)

    def _check_read(self):
        """Raise if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _check_write_ok(self):
        """Raise if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        result = {}
        if self._parents:
            for node in nodes:
                result[node[1]] = node[3][0]
        else:
            for node in nodes:
                result[node[1]] = None
        return result

    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys, False)
        for entry in entries:
            key = entry[1]
            if not self._parents:
                parents = None
            else:
                parents = entry[3][0]
            value = entry[2]
            method = 'group'
            result[key] = (self._node_to_position(entry),
                                  None, parents, (method, None))
        return result

    def keys(self):
        """Get all the keys in the collection.

        The keys are not ordered.
        """
        self._check_read()
        return [node[1] for node in self._graph_index.iter_all_entries()]

    def _node_to_position(self, node):
        """Convert an index value to position details."""
        bits = node[2].split(' ')
        # It would be nice not to read the entire gzip.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end
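
    # Example (added for clarity; the numbers are made up): an index value of
    # '0 4096 13 2048' describes a group whose compressed bytes start at
    # offset 0 and run for 4096 bytes, with this text's recipe occupying bytes
    # 13:2048 of the decompressed group, so _node_to_position() returns
    # (index, 0, 4096, 13, 2048).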


def _get_longest_match(equivalence_table, pos, max_pos, locations):
    """Get the longest possible match for the current position."""
    range_start = pos
    range_len = 0
    copy_ends = None
    while pos < max_pos:
        if locations is None:
            locations = equivalence_table.get_idx_matches(pos)
        if locations is None:
            # No more matches, just return whatever we have, but we know that
            # this last position is not going to match anything
            pos += 1
            break
        else:
            if copy_ends is None:
                # We are starting a new range
                copy_ends = [loc + 1 for loc in locations]
                range_len = 1
                locations = None # Consumed
            else:
                # We are currently in the middle of a match
                next_locations = set(copy_ends).intersection(locations)
                if len(next_locations):
                    # range continues
                    copy_ends = [loc + 1 for loc in next_locations]
                    range_len += 1
                    locations = None # Consumed
                else:
                    # But we are done with this match, we should be
                    # starting a new one, though. We will pass back 'locations'
                    # so that we don't have to do another lookup.
                    break
        pos += 1
    if copy_ends is None:
        return None, pos, locations
    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
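
# Illustrative trace (not in the original source; assumes an equivalence table
# whose stored left lines are ['a\n', 'b\n', 'c\n'] and whose right lines were
# set to ['a\n', 'b\n', 'x\n']):
#
#   _get_longest_match(table, 0, 3, None)
#   # -> ((0, 0, 2), 3, None): right lines 0-1 match the left side starting at
#   #    offset 0; position 2 ('x\n') matched nothing, so the scan advanced
#   #    past it and no cached locations are carried forward.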


try:
    from bzrlib.plugins.groupcompress import _groupcompress_c
except ImportError:
    pass
else:
    GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable
    _get_longest_match = _groupcompress_c._get_longest_match