~bzr-pqm/bzr/bzr.dev

0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1
# groupcompress, a bzr plugin providing new compression logic.
2
# Copyright (C) 2008 Canonical Limited.
3
# 
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License version 2 as published
6
# by the Free Software Foundation.
7
# 
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
# 
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
16
# 
17
18
"""Core compression logic for compressing streams of related files."""
19
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
20
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
21
from cStringIO import StringIO
22
import zlib
23
0.17.4 by Robert Collins
Annotate.
24
from bzrlib import (
25
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
26
    debug,
0.17.4 by Robert Collins
Annotate.
27
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
28
    errors,
0.17.4 by Robert Collins
Annotate.
29
    graph as _mod_graph,
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
30
    osutils,
0.17.4 by Robert Collins
Annotate.
31
    pack,
32
    patiencediff,
    progress,
    trace,
33
    )
34
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
35
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
36
from bzrlib.osutils import (
37
    contains_whitespace,
38
    contains_linebreaks,
39
    sha_string,
40
    sha_strings,
41
    split_lines,
42
    )
0.17.21 by Robert Collins
Update groupcompress to bzrlib 1.10.
43
from bzrlib.btree_index import BTreeBuilder
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
44
from bzrlib.lru_cache import LRUSizeCache
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
45
from bzrlib.plugins.groupcompress import equivalence_table
0.17.9 by Robert Collins
Initial stab at repository format support.
46
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
47
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
48
    adapter_registry,
49
    AbsentContentFactory,
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
50
    ChunkedContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
51
    FulltextContentFactory,
52
    VersionedFiles,
53
    )
54
0.23.15 by John Arbash Meinel
Handle when self._index is NULL, mostly because the source text was the empty string.
55
_NO_LABELS = False
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
56
_FAST = False
0.17.2 by Robert Collins
Core proof of concept working.
57
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
58
def parse(bytes):
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
59
    if _NO_LABELS:
60
        action_byte = bytes[0]
61
        action = {'f':'fulltext', 'd':'delta'}[action_byte]
62
        return action, None, None, bytes[1:]
0.23.12 by John Arbash Meinel
Add a 'len:' field to the data.
63
    (action, label_line, sha1_line, len_line,
0.23.16 by John Arbash Meinel
Properly restore the label functionality.
64
     delta_bytes) = bytes.split('\n', 4)
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
65
    if (action not in ('fulltext', 'delta')
0.23.27 by John Arbash Meinel
Fix up some failing tests.
66
        or not label_line.startswith('label:')
67
        or not sha1_line.startswith('sha1:')
68
        or not len_line.startswith('len:')
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
69
        ):
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
70
        raise AssertionError("bad text record %r" % (bytes,))
0.23.34 by John Arbash Meinel
Update groupcompress to allow it to read older conversions.
71
    label = tuple(label_line[6:].lstrip(' ').split('\x00'))
72
    sha1 = sha1_line[5:].lstrip(' ')
0.23.27 by John Arbash Meinel
Fix up some failing tests.
73
    length = int(len_line[4:])
0.23.12 by John Arbash Meinel
Add a 'len:' field to the data.
74
    if len(delta_bytes) != length:
75
        raise AssertionError("bad length record %r" % (bytes,))
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
76
    return action, label, sha1, delta_bytes
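

# A minimal sketch (not part of the original module): the record layout that
# parse() expects when labels are enabled (_NO_LABELS is False), mirroring
# what GroupCompressor.compress() emits; the key used here is hypothetical.
def _parse_example():
    text = 'hello world\n'
    text_sha = sha_string(text)
    record = 'fulltext\nlabel:file-id\x00rev-id\nsha1:%s\nlen:%d\n%s' % (
        text_sha, len(text), text)
    action, label, sha1, body = parse(record)
    assert action == 'fulltext'
    assert label == ('file-id', 'rev-id')
    assert sha1 == text_sha
    assert body == text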
77
78
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
79
def sort_gc_optimal(parent_map):
80
    """Sort and group the keys in parent_map into gc-optimal order.
81
82
    gc-optimal is defined (currently) as reverse-topological order, grouped by
83
    the key prefix.
84
85
    :return: A sorted-list of keys
86
    """
87
    # gc-optimal ordering is approximately reverse topological,
88
    # properly grouped by file-id.
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
89
    per_prefix_map = {}
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
90
    for item in parent_map.iteritems():
91
        key = item[0]
92
        if isinstance(key, str) or len(key) == 1:
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
93
            prefix = ''
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
94
        else:
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
95
            prefix = key[0]
96
        try:
97
            per_prefix_map[prefix].append(item)
98
        except KeyError:
99
            per_prefix_map[prefix] = [item]
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
100
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
101
    present_keys = []
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
102
    for prefix in sorted(per_prefix_map):
103
        present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
104
    return present_keys
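

# A minimal sketch (not part of the original module) of the ordering produced
# above for two-element (file_id, revision_id) keys; the keys are hypothetical.
def _sort_gc_optimal_example():
    demo_parent_map = {
        ('file-a', 'rev-1'): (),
        ('file-a', 'rev-2'): (('file-a', 'rev-1'),),
        ('file-b', 'rev-1'): (),
    }
    # Keys are grouped by prefix ('file-a' before 'file-b') and each group is
    # reverse-topologically sorted, so children come before their parents.
    assert sort_gc_optimal(demo_parent_map) == [
        ('file-a', 'rev-2'), ('file-a', 'rev-1'), ('file-b', 'rev-1')]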
105
106
0.17.2 by Robert Collins
Core proof of concept working.
107
class GroupCompressor(object):
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
108
    """Produce a serialised group of compressed texts.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
109
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
110
    It contains code very similar to SequenceMatcher because it has a similar
111
    task. However some key differences apply:
112
     - there is no junk, we want a minimal edit not a human readable diff.
113
     - we don't filter very common lines (because we don't know where a good
114
       range will start, and after the first text we want to be emitting minimal
115
       edits only).
116
     - we chain the left side, not the right side
117
     - we incrementally update the adjacency matrix as new lines are provided.
118
     - we look for matches in all of the left side, so the routine which does
119
       the analogous task of find_longest_match does not need to filter on the
120
       left side.
121
    """
0.17.2 by Robert Collins
Core proof of concept working.
122
123
    def __init__(self, delta=True):
124
        """Create a GroupCompressor.
125
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
126
        :param delta: If False, do not compress records.
0.17.2 by Robert Collins
Core proof of concept working.
127
        """
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
128
        # Consider seeding the lines with some sort of GC Start flag, or
129
        # putting it as part of the output stream, rather than in the
130
        # compressed bytes.
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
131
        self.lines = []
0.17.2 by Robert Collins
Core proof of concept working.
132
        self.endpoint = 0
133
        self.input_bytes = 0
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
134
        self.labels_deltas = {}
0.23.26 by John Arbash Meinel
We now start to make use of the ability to extend the delta index
135
        self._delta_index = _groupcompress_pyx.DeltaIndex()
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
136
137
    def compress(self, key, chunks, expected_sha, soft=False):
0.17.2 by Robert Collins
Core proof of concept working.
138
        """Compress lines with label key.
139
140
        :param key: A key tuple. It is stored in the output
0.17.26 by Robert Collins
Working better --gc-plain-chk.
141
            for identification of the text during decompression. If the last
142
            element is 'None' it is replaced with the sha1 of the text -
143
            e.g. sha1:xxxxxxx.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
144
        :param chunks: The chunks to be compressed
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
145
        :param expected_sha: If non-None, the sha the lines are believed to
0.17.2 by Robert Collins
Core proof of concept working.
146
            have. During compression the sha is calculated; a mismatch will
147
            cause an error.
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
148
        :param soft: Do a 'soft' compression. This means that we require larger
149
            ranges to match to be considered for a copy command.
0.17.2 by Robert Collins
Core proof of concept working.
150
        :return: The sha1 of lines, and the number of bytes accumulated in
151
            the group output so far.
152
        """
0.23.52 by John Arbash Meinel
Use the max_delta flag.
153
        # TODO: Change this to a bytes interface, since the output is now a
154
        #       bytes interface anyway.
155
        bytes = ''.join(chunks)
156
        sha1 = sha_string(bytes)
0.17.26 by Robert Collins
Working better --gc-plain-chk.
157
        if key[-1] is None:
158
            key = key[:-1] + ('sha1:' + sha1,)
0.17.2 by Robert Collins
Core proof of concept working.
159
        label = '\x00'.join(key)
0.23.52 by John Arbash Meinel
Use the max_delta flag.
160
        input_len = len(bytes)
0.23.12 by John Arbash Meinel
Add a 'len:' field to the data.
161
        # By having action/label/sha1/len, we can parse the group if the index
162
        # was ever destroyed, we have the key in 'label', we know the final
163
        # bytes are valid from sha1, and we know where to find the end of this
164
        # record because of 'len'. (the delta record itself will store the
165
        # total length for the expanded record)
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
166
        # 'len: %d\n' costs approximately 1% increase in total data
167
        # Having the labels at all costs us 9-10% increase, 38% increase for
168
        # inventory pages, and 5.8% increase for text pages
169
        if _NO_LABELS:
170
            new_chunks = []
171
        else:
0.23.27 by John Arbash Meinel
Fix up some failing tests.
172
            new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
0.23.33 by John Arbash Meinel
Fix a bug when handling multiple large-range copies.
173
        if self._delta_index._source_offset != self.endpoint:
174
            raise AssertionError('_source_offset != endpoint'
175
                ' somehow the DeltaIndex got out of sync with'
176
                ' the output lines')
0.23.52 by John Arbash Meinel
Use the max_delta flag.
177
        max_delta_size = len(bytes) / 2
178
        delta = self._delta_index.make_delta(bytes, max_delta_size)
179
        if (delta is None):
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
180
            # We can't delta (perhaps source_text is empty)
181
            # so mark this as an insert
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
182
            if _NO_LABELS:
183
                new_chunks = ['f']
184
            else:
185
                new_chunks.insert(0, 'fulltext\n')
0.23.27 by John Arbash Meinel
Fix up some failing tests.
186
                new_chunks.append('len:%s\n' % (input_len,))
0.23.26 by John Arbash Meinel
We now start to make use of the ability to extend the delta index
187
            unadded_bytes = sum(map(len, new_chunks))
0.23.52 by John Arbash Meinel
Use the max_delta flag.
188
            self._delta_index.add_source(bytes, unadded_bytes)
189
            new_chunks.append(bytes)
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
190
        else:
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
191
            if _NO_LABELS:
0.23.26 by John Arbash Meinel
We now start to make use of the ability to extend the delta index
192
                new_chunks = ['d']
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
193
            else:
194
                new_chunks.insert(0, 'delta\n')
0.23.27 by John Arbash Meinel
Fix up some failing tests.
195
                new_chunks.append('len:%s\n' % (len(delta),))
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
196
            if _FAST:
197
                new_chunks.append(delta)
198
                unadded_bytes = sum(map(len, new_chunks))
199
                self._delta_index._source_offset += unadded_bytes
200
            else:
201
                unadded_bytes = sum(map(len, new_chunks))
0.23.47 by John Arbash Meinel
Use the new add_delta_source.
202
                self._delta_index.add_delta_source(delta, unadded_bytes)
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
203
                new_chunks.append(delta)
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
204
        delta_start = (self.endpoint, len(self.lines))
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
205
        self.output_chunks(new_chunks)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
206
        self.input_bytes += input_len
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
207
        delta_end = (self.endpoint, len(self.lines))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
208
        self.labels_deltas[key] = (delta_start, delta_end)
0.23.29 by John Arbash Meinel
Forgot to add the delta bytes to the index objects.
209
        if not self._delta_index._source_offset == self.endpoint:
210
            raise AssertionError('the delta index is out of sync'
211
                ' with the output lines %s != %s'
212
                % (self._delta_index._source_offset, self.endpoint))
0.17.2 by Robert Collins
Core proof of concept working.
213
        return sha1, self.endpoint
214
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
215
    def extract(self, key):
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
216
        """Extract a key previously added to the compressor.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
217
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
218
        :param key: The key to extract.
219
        :return: An iterable over bytes and the sha1.
220
        """
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
221
        delta_details = self.labels_deltas[key]
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
222
        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
223
        action, label, sha1, delta = parse(''.join(delta_chunks))
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
224
        if not _NO_LABELS and label != key:
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
225
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
226
        if action == 'fulltext':
227
            bytes = delta
228
        else:
229
            source = ''.join(self.lines[:delta_details[0][1]])
0.23.21 by John Arbash Meinel
Rename the extension to _pyx, since Robert prefers that form
230
            bytes = _groupcompress_pyx.apply_delta(source, delta)
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
231
        if _NO_LABELS:
232
            sha1 = sha_string(bytes)
233
        else:
234
            assert sha1 == sha_string(bytes)
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
235
        return [bytes], sha1
236
237
    def output_chunks(self, new_chunks):
238
        """Output some chunks.
239
240
        :param new_chunks: The chunks to output.
241
        """
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
242
        endpoint = self.endpoint
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
243
        self.lines.extend(new_chunks)
244
        endpoint += sum(map(len, new_chunks))
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
245
        self.endpoint = endpoint
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
246
0.17.2 by Robert Collins
Core proof of concept working.
247
    def ratio(self):
248
        """Return the overall compression ratio."""
249
        return float(self.input_bytes) / float(self.endpoint)
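

# A minimal usage sketch (not part of the original module); it assumes the
# _groupcompress_pyx extension imported at the end of this file is available,
# and the keys and texts are hypothetical.
def _group_compressor_example():
    compressor = GroupCompressor()
    sha1, _ = compressor.compress(('file-id', 'rev-1'), ['common line\n'], None)
    compressor.compress(('file-id', 'rev-2'),
                        ['common line\n', 'new line\n'], None)
    chunks, extracted_sha1 = compressor.extract(('file-id', 'rev-1'))
    assert ''.join(chunks) == 'common line\n'
    assert extracted_sha1 == sha1
    # ratio() reports input bytes over bytes accumulated in the group so far.
    return compressor.ratio()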
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
250
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
251
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
252
def make_pack_factory(graph, delta, keylength):
253
    """Create a factory for creating a pack based groupcompress.
254
255
    This is only functional enough to run interface tests; it doesn't try to
256
    provide a full pack environment.
257
    
258
    :param graph: Store a graph.
259
    :param delta: Delta compress contents.
260
    :param keylength: How long should keys be.
261
    """
262
    def factory(transport):
263
        parents = graph or delta
264
        ref_length = 0
265
        if graph:
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
266
            ref_length = 1
0.17.7 by Robert Collins
Update for current index2 changes.
267
        graph_index = BTreeBuilder(reference_lists=ref_length,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
268
            key_elements=keylength)
269
        stream = transport.open_write_stream('newpack')
270
        writer = pack.ContainerWriter(stream.write)
271
        writer.begin()
272
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
0.17.9 by Robert Collins
Initial stab at repository format support.
273
            add_callback=graph_index.add_nodes)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
274
        access = _DirectPackAccess({})
275
        access.set_writer(writer, graph_index, (transport, 'newpack'))
0.17.2 by Robert Collins
Core proof of concept working.
276
        result = GroupCompressVersionedFiles(index, access, delta)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
277
        result.stream = stream
278
        result.writer = writer
279
        return result
280
    return factory
281
282
283
def cleanup_pack_group(versioned_files):
0.17.23 by Robert Collins
Only decompress as much of the zlib data as is needed to read the text recipe.
284
    versioned_files.writer.end()
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
285
    versioned_files.stream.close()
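

# A minimal wiring sketch (not part of the original module): building a
# pack-backed GroupCompressVersionedFiles on a local transport, adding one
# text and closing the pack; the transport import, directory argument and key
# are illustrative assumptions.
def _pack_group_example(directory):
    from bzrlib import transport as _mod_transport
    t = _mod_transport.get_transport(directory)
    factory = make_pack_factory(graph=True, delta=True, keylength=1)
    vf = factory(t)
    sha1, num_bytes, _ = vf.add_lines(('demo-rev',), (), ['a line of text\n'])
    cleanup_pack_group(vf)
    return sha1, num_bytes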
286
287
288
class GroupCompressVersionedFiles(VersionedFiles):
289
    """A group-compress based VersionedFiles implementation."""
290
0.17.2 by Robert Collins
Core proof of concept working.
291
    def __init__(self, index, access, delta=True):
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
292
        """Create a GroupCompressVersionedFiles object.
293
294
        :param index: The index object storing access and graph data.
295
        :param access: The access object storing raw data.
0.17.2 by Robert Collins
Core proof of concept working.
296
        :param delta: Whether to delta compress or just entropy compress.
297
        """
298
        self._index = index
299
        self._access = access
300
        self._delta = delta
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
301
        self._unadded_refs = {}
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
302
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
0.17.2 by Robert Collins
Core proof of concept working.
303
304
    def add_lines(self, key, parents, lines, parent_texts=None,
305
        left_matching_blocks=None, nostore_sha=None, random_id=False,
306
        check_content=True):
307
        """Add a text to the store.
308
309
        :param key: The key tuple of the text to add.
310
        :param parents: The parents key tuples of the text to add.
311
        :param lines: A list of lines. Each line must be a bytestring. And all
312
            of them except the last must be terminated with \n and contain no
313
            other \n's. The last line may either contain no \n's or a single
314
            terminating \n. If the lines list does not meet this constraint the add
315
            routine may error or may succeed - but you will be unable to read
316
            the data back accurately. (Checking the lines have been split
317
            correctly is expensive and extremely unlikely to catch bugs so it
318
            is not done at runtime unless check_content is True.)
319
        :param parent_texts: An optional dictionary containing the opaque 
320
            representations of some or all of the parents of version_id to
321
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
322
            returned by add_lines or data corruption can be caused.
323
        :param left_matching_blocks: a hint about which areas are common
324
            between the text and its left-hand-parent.  The format is
325
            the SequenceMatcher.get_matching_blocks format.
326
        :param nostore_sha: Raise ExistingContent and do not add the lines to
327
            the versioned file if the digest of the lines matches this.
328
        :param random_id: If True a random id has been selected rather than
329
            an id determined by some deterministic process such as a converter
330
            from a foreign VCS. When True the backend may choose not to check
331
            for uniqueness of the resulting key within the versioned file, so
332
            this should only be done when the result is expected to be unique
333
            anyway.
334
        :param check_content: If True, the lines supplied are verified to be
335
            bytestrings that are correctly formed lines.
336
        :return: The text sha1, the number of bytes in the text, and an opaque
337
                 representation of the inserted version which can be provided
338
                 back to future add_lines calls in the parent_texts dictionary.
339
        """
340
        self._index._check_write_ok()
341
        self._check_add(key, lines, random_id, check_content)
342
        if parents is None:
343
            # The caller might pass None if there is no graph data, but kndx
344
            # indexes can't directly store that, so we give them
345
            # an empty tuple instead.
346
            parents = ()
347
        # double handling for now. Make it work until then.
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
348
        length = sum(map(len, lines))
349
        record = ChunkedContentFactory(key, parents, None, lines)
0.17.5 by Robert Collins
nograph tests completely passing.
350
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
351
        return sha1, length, None
0.17.2 by Robert Collins
Core proof of concept working.
352
0.17.4 by Robert Collins
Annotate.
353
    def annotate(self, key):
354
        """See VersionedFiles.annotate."""
355
        graph = Graph(self)
0.17.5 by Robert Collins
nograph tests completely passing.
356
        parent_map = self.get_parent_map([key])
357
        if not parent_map:
358
            raise errors.RevisionNotPresent(key, self)
359
        if parent_map[key] is not None:
360
            search = graph._make_breadth_first_searcher([key])
361
            keys = set()
362
            while True:
363
                try:
364
                    present, ghosts = search.next_with_ghosts()
365
                except StopIteration:
366
                    break
367
                keys.update(present)
368
            parent_map = self.get_parent_map(keys)
369
        else:
370
            keys = [key]
371
            parent_map = {key:()}
0.17.4 by Robert Collins
Annotate.
372
        head_cache = _mod_graph.FrozenHeadsCache(graph)
373
        parent_cache = {}
374
        reannotate = annotate.reannotate
375
        for record in self.get_record_stream(keys, 'topological', True):
376
            key = record.key
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
377
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
0.17.4 by Robert Collins
Annotate.
378
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
379
            parent_cache[key] = list(
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
380
                reannotate(parent_lines, chunks, key, None, head_cache))
0.17.4 by Robert Collins
Annotate.
381
        return parent_cache[key]
382
0.17.5 by Robert Collins
nograph tests completely passing.
383
    def check(self, progress_bar=None):
384
        """See VersionedFiles.check()."""
385
        keys = self.keys()
386
        for record in self.get_record_stream(keys, 'unordered', True):
387
            record.get_bytes_as('fulltext')
388
0.17.2 by Robert Collins
Core proof of concept working.
389
    def _check_add(self, key, lines, random_id, check_content):
390
        """check that version_id and lines are safe to add."""
391
        version_id = key[-1]
0.17.26 by Robert Collins
Working better --gc-plain-chk.
392
        if version_id is not None:
393
            if contains_whitespace(version_id):
394
                raise errors.InvalidRevisionId(version_id, self)
0.17.2 by Robert Collins
Core proof of concept working.
395
        self.check_not_reserved_id(version_id)
396
        # TODO: If random_id==False and the key is already present, we should
397
        # probably check that the existing content is identical to what is
398
        # being inserted, and otherwise raise an exception.  This would make
399
        # the bundle code simpler.
400
        if check_content:
401
            self._check_lines_not_unicode(lines)
402
            self._check_lines_are_lines(lines)
403
0.17.5 by Robert Collins
nograph tests completely passing.
404
    def get_parent_map(self, keys):
405
        """Get a map of the parents of keys.
406
407
        :param keys: The keys to look up parents for.
408
        :return: A mapping from keys to parents. Absent keys are absent from
409
            the mapping.
410
        """
411
        result = {}
412
        sources = [self._index]
413
        source_results = []
414
        missing = set(keys)
415
        for source in sources:
416
            if not missing:
417
                break
418
            new_result = source.get_parent_map(missing)
419
            source_results.append(new_result)
420
            result.update(new_result)
421
            missing.difference_update(set(new_result))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
422
        if self._unadded_refs:
423
            for key in missing:
424
                if key in self._unadded_refs:
425
                    result[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
426
        return result
427
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
428
    def _get_group_and_delta_bytes(self, index_memo):
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
429
        read_memo = index_memo[0:3]
430
        # get the group:
431
        try:
432
            plain = self._group_cache[read_memo]
433
        except KeyError:
434
            # read the group
435
            zdata = self._access.get_raw_records([read_memo]).next()
436
            # decompress - whole thing - this is not a bug, as it
437
            # permits caching. We might want to store the partially
438
            # decompressed group and decompress object, so that recent
439
            # texts are not penalised by big groups.
440
            plain = zlib.decompress(zdata) #, index_memo[4])
441
            self._group_cache[read_memo] = plain
442
        # cheapo debugging:
443
        # print len(zdata), len(plain)
444
        # parse - requires split_lines, better to have byte offsets
445
        # here (but not by much - we only split the region for the
446
        # recipe, and we often want to end up with lines anyway).
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
447
        return plain, plain[index_memo[3]:index_memo[4]]
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
448
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
449
    def get_missing_compression_parent_keys(self):
450
        """Return the keys of missing compression parents.
451
452
        Missing compression parents occur when a record stream was missing
453
        basis texts, or an index was scanned that had missing basis texts.
454
        """
455
        # GroupCompress cannot currently reference texts that are not in the
456
        # group, so this is valid for now
457
        return frozenset()
458
0.17.5 by Robert Collins
nograph tests completely passing.
459
    def get_record_stream(self, keys, ordering, include_delta_closure):
460
        """Get a stream of records for keys.
461
462
        :param keys: The keys to include.
463
        :param ordering: One of 'unordered', 'topological', 'gc-optimal' or
464
            'as-requested'. A topologically sorted stream has compression
465
            parents strictly before their children.
466
        :param include_delta_closure: If True then the closure across any
467
            compression parents will be included (in the opaque data).
468
        :return: An iterator of ContentFactory objects, each of which is only
469
            valid until the iterator is advanced.
470
        """
471
        # keys might be a generator
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
472
        orig_keys = list(keys)
473
        keys = set(orig_keys)
0.17.5 by Robert Collins
nograph tests completely passing.
474
        if not keys:
475
            return
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
476
        if (not self._index.has_graph
477
            and ordering in ('topological', 'gc-optimal')):
0.17.5 by Robert Collins
nograph tests completely passing.
478
            # Cannot topological order when no graph has been stored.
479
            ordering = 'unordered'
480
        # Cheap: iterate
481
        locations = self._index.get_build_details(keys)
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
482
        local_keys = frozenset(keys).intersection(set(self._unadded_refs))
0.17.5 by Robert Collins
nograph tests completely passing.
483
        if ordering == 'topological':
484
            # would be better to not globally sort initially but instead
485
            # start with one key, recurse to its oldest parent, then grab
486
            # everything in the same group, etc.
487
            parent_map = dict((key, details[2]) for key, details in
488
                locations.iteritems())
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
489
            for key in local_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
490
                parent_map[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
491
            present_keys = topo_sort(parent_map)
492
            # Now group by source:
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
493
        elif ordering == 'gc-optimal':
494
            parent_map = dict((key, details[2]) for key, details in
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
495
                              locations.iteritems())
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
496
            for key in local_keys:
497
                parent_map[key] = self._unadded_refs[key]
498
            # XXX: This only optimizes for the target ordering. We may need to
499
            #      balance that with the time it takes to extract ordering, by
500
            #      somehow grouping based on locations[key][0:3]
501
            present_keys = sort_gc_optimal(parent_map)
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
502
        elif ordering == 'as-requested':
0.20.32 by John Arbash Meinel
Fix bug #336373 by adding local keys to locations after the fact, rather than before.
503
            present_keys = [key for key in orig_keys if key in locations
504
                            or key in local_keys]
0.17.5 by Robert Collins
nograph tests completely passing.
505
        else:
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
506
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
507
            # Otherwise we thrash the _group_cache and destroy performance
508
            def get_group(key):
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
509
                # This is the group the bytes are stored in, followed by the
510
                # location in the group
511
                return locations[key][0]
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
512
            present_keys = sorted(locations.iterkeys(), key=get_group)
513
            # We don't have an ordering for keys in the in-memory object, but
514
            # lets process the in-memory ones first.
515
            present_keys = list(local_keys) + present_keys
0.20.32 by John Arbash Meinel
Fix bug #336373 by adding local keys to locations after the fact, rather than before.
516
        locations.update((key, None) for key in local_keys)
0.17.5 by Robert Collins
nograph tests completely passing.
517
        absent_keys = keys.difference(set(locations))
518
        for key in absent_keys:
519
            yield AbsentContentFactory(key)
520
        for key in present_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
521
            if key in self._unadded_refs:
0.20.28 by John Arbash Meinel
Fix typo with the recent lines => chunks rename.
522
                chunks, sha1 = self._compressor.extract(key)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
523
                parents = self._unadded_refs[key]
524
            else:
525
                index_memo, _, parents, (method, _) = locations[key]
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
526
                plain, delta_bytes = self._get_group_and_delta_bytes(index_memo)
527
                action, label, sha1, delta = parse(delta_bytes)
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
528
                if not _NO_LABELS and label != key:
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
529
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
530
                if action == 'fulltext':
531
                    chunks = [delta]
532
                else:
533
                    # TODO: relax apply_delta so that it can allow source to be
534
                    #       longer than expected
0.23.21 by John Arbash Meinel
Rename the extension to _pyx, since Robert prefers that form
535
                    bytes = _groupcompress_pyx.apply_delta(plain, delta)
0.23.10 by John Arbash Meinel
Allowing the source bytes to be longer than expected.
536
                    if bytes is None:
537
                        raise AssertionError('apply_delta returned None')
538
                    chunks = [bytes]
539
                    del bytes
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
540
                if _NO_LABELS:
541
                    sha1 = sha_strings(chunks)
542
                else:
543
                    if sha_strings(chunks) != sha1:
544
                        raise AssertionError('sha1 sum did not match')
0.20.22 by John Arbash Meinel
Make it clear that the bits you get from 'apply_delta' are chunks, not lines.
545
            yield ChunkedContentFactory(key, parents, sha1, chunks)
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
546
0.17.5 by Robert Collins
nograph tests completely passing.
547
    def get_sha1s(self, keys):
548
        """See VersionedFiles.get_sha1s()."""
549
        result = {}
550
        for record in self.get_record_stream(keys, 'unordered', True):
551
            if record.sha1 is not None:
552
                result[record.key] = record.sha1
553
            else:
554
                if record.storage_kind != 'absent':
555
                    result[record.key] = sha_string(record.get_bytes_as(
556
                        'fulltext'))
557
        return result
558
0.17.2 by Robert Collins
Core proof of concept working.
559
    def insert_record_stream(self, stream):
560
        """Insert a record stream into this container.
561
562
        :param stream: A stream of records to insert. 
563
        :return: None
564
        :seealso VersionedFiles.get_record_stream:
565
        """
0.17.5 by Robert Collins
nograph tests completely passing.
566
        for _ in self._insert_record_stream(stream):
567
            pass
0.17.2 by Robert Collins
Core proof of concept working.
568
0.17.5 by Robert Collins
nograph tests completely passing.
569
    def _insert_record_stream(self, stream, random_id=False):
0.17.2 by Robert Collins
Core proof of concept working.
570
        """Internal core to insert a record stream into this container.
571
572
        This helper function has a different interface than insert_record_stream
573
        to allow add_lines to be minimal, but still return the needed data.
574
575
        :param stream: A stream of records to insert. 
576
        :return: An iterator over the sha1 of the inserted records.
577
        :seealso insert_record_stream:
578
        :seealso add_lines:
579
        """
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
580
        adapters = {}
0.17.5 by Robert Collins
nograph tests completely passing.
581
        def get_adapter(adapter_key):
582
            try:
583
                return adapters[adapter_key]
584
            except KeyError:
585
                adapter_factory = adapter_registry.get(adapter_key)
586
                adapter = adapter_factory(self)
587
                adapters[adapter_key] = adapter
588
                return adapter
0.17.2 by Robert Collins
Core proof of concept working.
589
        # This will go up to fulltexts for gc to gc fetching, which isn't
590
        # ideal.
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
591
        self._compressor = GroupCompressor(self._delta)
592
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
593
        keys_to_add = []
594
        basis_end = 0
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
595
        groups = 1
596
        def flush():
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
597
            compressed = zlib.compress(''.join(self._compressor.lines))
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
598
            index, start, length = self._access.add_raw_records(
599
                [(None, len(compressed))], compressed)[0]
600
            nodes = []
601
            for key, reads, refs in keys_to_add:
602
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
603
            self._index.add_records(nodes, random_id=random_id)
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
604
        last_prefix = None
0.17.2 by Robert Collins
Core proof of concept working.
605
        for record in stream:
0.17.5 by Robert Collins
nograph tests completely passing.
606
            # Raise an error when a record is missing.
607
            if record.storage_kind == 'absent':
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
608
                raise errors.RevisionNotPresent(record.key, self)
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
609
            try:
0.23.52 by John Arbash Meinel
Use the max_delta flag.
610
                bytes = record.get_bytes_as('fulltext')
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
611
            except errors.UnavailableRepresentation:
0.17.5 by Robert Collins
nograph tests completely passing.
612
                adapter_key = record.storage_kind, 'fulltext'
613
                adapter = get_adapter(adapter_key)
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
614
                bytes = adapter.get_bytes(record)
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
615
            soft = False
0.20.13 by John Arbash Meinel
Play around a bit.
616
            if len(record.key) > 1:
617
                prefix = record.key[0]
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
618
                if (last_prefix is not None and prefix != last_prefix):
619
                    soft = True
0.23.11 by John Arbash Meinel
Insert a fulltext if the delta is more than half the total size.
620
                    if basis_end > 1024 * 1024 * 2:
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
621
                        flush()
622
                        self._compressor = GroupCompressor(self._delta)
623
                        self._unadded_refs = {}
624
                        keys_to_add = []
625
                        basis_end = 0
626
                        groups += 1
627
                last_prefix = prefix
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
628
            found_sha1, end_point = self._compressor.compress(record.key,
0.23.52 by John Arbash Meinel
Use the max_delta flag.
629
                [bytes], record.sha1, soft=soft)
0.17.26 by Robert Collins
Working better --gc-plain-chk.
630
            if record.key[-1] is None:
631
                key = record.key[:-1] + ('sha1:' + found_sha1,)
632
            else:
633
                key = record.key
634
            self._unadded_refs[key] = record.parents
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
635
            yield found_sha1
0.17.26 by Robert Collins
Working better --gc-plain-chk.
636
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
0.17.5 by Robert Collins
nograph tests completely passing.
637
                (record.parents,)))
638
            basis_end = end_point
0.23.11 by John Arbash Meinel
Insert a fulltext if the delta is more than half the total size.
639
            if basis_end > 1024 * 1024 * 4:
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
640
                flush()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
641
                self._compressor = GroupCompressor(self._delta)
642
                self._unadded_refs = {}
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
643
                keys_to_add = []
644
                basis_end = 0
645
                groups += 1
0.17.8 by Robert Collins
Flush pending updates at the end of _insert_record_stream
646
        if len(keys_to_add):
647
            flush()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
648
        self._compressor = None
649
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
650
651
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
652
        """Iterate over the lines in the versioned files from keys.
653
654
        This may return lines from other keys. Each item the returned
655
        iterator yields is a tuple of a line and a text version that that line
656
        is present in (not introduced in).
657
658
        Ordering of results is in whatever order is most suitable for the
659
        underlying storage format.
660
661
        If a progress bar is supplied, it may be used to indicate progress.
662
        The caller is responsible for cleaning up progress bars (because this
663
        is an iterator).
664
665
        NOTES:
666
         * Lines are normalised by the underlying store: they will all have \n
667
           terminators.
668
         * Lines are returned in arbitrary order.
669
670
        :return: An iterator over (line, key).
671
        """
672
        if pb is None:
673
            pb = progress.DummyProgress()
674
        keys = set(keys)
675
        total = len(keys)
676
        # we don't care about inclusions, the caller cares.
677
        # but we need to setup a list of records to visit.
678
        # we need key, position, length
679
        for key_idx, record in enumerate(self.get_record_stream(keys,
680
            'unordered', True)):
681
            # XXX: todo - optimise to use less than full texts.
682
            key = record.key
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
683
            pb.update('Walking content.', key_idx + 1, total)
0.17.5 by Robert Collins
nograph tests completely passing.
684
            if record.storage_kind == 'absent':
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
685
                raise errors.RevisionNotPresent(key, self)
0.17.5 by Robert Collins
nograph tests completely passing.
686
            lines = split_lines(record.get_bytes_as('fulltext'))
687
            for line in lines:
688
                yield line, key
689
        pb.update('Walking content.', total, total)
690
691
    def keys(self):
692
        """See VersionedFiles.keys."""
693
        if 'evil' in debug.debug_flags:
694
            trace.mutter_callsite(2, "keys scales with size of history")
695
        sources = [self._index]
696
        result = set()
697
        for source in sources:
698
            result.update(source.keys())
699
        return result
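

# A minimal read-back sketch (not part of the original module): streaming
# texts out of a GroupCompressVersionedFiles in one of the orderings
# supported by get_record_stream(); 'vf' and 'keys' are placeholders.
def _read_back_example(vf, keys):
    texts = {}
    for record in vf.get_record_stream(keys, 'gc-optimal', True):
        if record.storage_kind == 'absent':
            raise errors.RevisionNotPresent(record.key, vf)
        texts[record.key] = record.get_bytes_as('fulltext')
    return texts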
700
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
701
702
class _GCGraphIndex(object):
703
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
704
0.17.9 by Robert Collins
Initial stab at repository format support.
705
    def __init__(self, graph_index, is_locked, parents=True,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
706
        add_callback=None):
707
        """Construct a _GCGraphIndex on a graph_index.
708
709
        :param graph_index: An implementation of bzrlib.index.GraphIndex.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
710
        :param is_locked: A callback, returns True if the index is locked and
711
            thus usable.
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
712
        :param parents: If True, record parents; if not, do not record
713
            parents.
714
        :param add_callback: If not None, allow additions to the index and call
715
            this callback with a list of added GraphIndex nodes:
716
            [(node, value, node_refs), ...]
717
        """
718
        self._add_callback = add_callback
719
        self._graph_index = graph_index
720
        self._parents = parents
721
        self.has_graph = parents
722
        self._is_locked = is_locked
723
0.17.5 by Robert Collins
nograph tests completely passing.
724
    def add_records(self, records, random_id=False):
725
        """Add multiple records to the index.
726
        
727
        This function does not insert data into the Immutable GraphIndex
728
        backing this _GCGraphIndex; instead it prepares data for insertion by
729
        the caller and checks that it is safe to insert then calls
730
        self._add_callback with the prepared GraphIndex nodes.
731
732
        :param records: a list of tuples:
733
                         (key, value, node_refs).
734
        :param random_id: If True the ids being added were randomly generated
735
            and no check for existence will be performed.
736
        """
737
        if not self._add_callback:
738
            raise errors.ReadOnlyError(self)
739
        # we hope there are no repositories with inconsistent parentage
740
        # anymore.
741
742
        changed = False
743
        keys = {}
744
        for (key, value, refs) in records:
745
            if not self._parents:
746
                if refs:
747
                    for ref in refs:
748
                        if ref:
749
                            raise errors.KnitCorrupt(self,
750
                                "attempt to add node with parents "
751
                                "in parentless index.")
752
                    refs = ()
753
                    changed = True
754
            keys[key] = (value, refs)
755
        # check for dups
756
        if not random_id:
757
            present_nodes = self._get_entries(keys)
758
            for (index, key, value, node_refs) in present_nodes:
759
                if node_refs != keys[key][1]:
760
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
761
                        ": %s %s" % ((value, node_refs), keys[key]))
762
                del keys[key]
763
                changed = True
764
        if changed:
765
            result = []
766
            if self._parents:
767
                for key, (value, node_refs) in keys.iteritems():
768
                    result.append((key, value, node_refs))
769
            else:
770
                for key, (value, node_refs) in keys.iteritems():
771
                    result.append((key, value))
772
            records = result
773
        self._add_callback(records)
774
        
775
    def _check_read(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
776
        """Raise an exception if reads are not permitted."""
0.17.5 by Robert Collins
nograph tests completely passing.
777
        if not self._is_locked():
778
            raise errors.ObjectNotLocked(self)
779
0.17.2 by Robert Collins
Core proof of concept working.
780
    def _check_write_ok(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
781
        """Raise an exception if writes are not permitted."""
0.17.2 by Robert Collins
Core proof of concept working.
782
        if not self._is_locked():
783
            raise errors.ObjectNotLocked(self)
784
0.17.5 by Robert Collins
nograph tests completely passing.
785
    def _get_entries(self, keys, check_present=False):
786
        """Get the entries for keys.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
787
788
        Note: Callers are responsible for checking that the index is locked
789
        before calling this method.
790
0.17.5 by Robert Collins
nograph tests completely passing.
791
        :param keys: An iterable of index key tuples.
792
        """
793
        keys = set(keys)
794
        found_keys = set()
795
        if self._parents:
796
            for node in self._graph_index.iter_entries(keys):
797
                yield node
798
                found_keys.add(node[1])
799
        else:
800
            # adapt parentless index to the rest of the code.
801
            for node in self._graph_index.iter_entries(keys):
802
                yield node[0], node[1], node[2], ()
803
                found_keys.add(node[1])
804
        if check_present:
805
            missing_keys = keys.difference(found_keys)
806
            if missing_keys:
807
                raise errors.RevisionNotPresent(missing_keys.pop(), self)
808
809
    def get_parent_map(self, keys):
810
        """Get a map of the parents of keys.
811
812
        :param keys: The keys to look up parents for.
813
        :return: A mapping from keys to parents. Absent keys are absent from
814
            the mapping.
815
        """
816
        self._check_read()
817
        nodes = self._get_entries(keys)
818
        result = {}
819
        if self._parents:
820
            for node in nodes:
821
                result[node[1]] = node[3][0]
822
        else:
823
            for node in nodes:
824
                result[node[1]] = None
825
        return result
826
827
    def get_build_details(self, keys):
828
        """Get the various build details for keys.
829
830
        Ghosts are omitted from the result.
831
832
        :param keys: An iterable of keys.
833
        :return: A dict of key:
834
            (index_memo, compression_parent, parents, record_details).
835
            index_memo
836
                opaque structure to pass to read_records to extract the raw
837
                data
838
            compression_parent
839
                Content that this record is built upon, may be None
840
            parents
841
                Logical parents of this node
842
            record_details
843
                extra information about the content which needs to be passed to
844
                Factory.parse_record
845
        """
846
        self._check_read()
847
        result = {}
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
848
        entries = self._get_entries(keys)
0.17.5 by Robert Collins
nograph tests completely passing.
849
        for entry in entries:
850
            key = entry[1]
851
            if not self._parents:
852
                parents = None
853
            else:
854
                parents = entry[3][0]
855
            method = 'group'
856
            result[key] = (self._node_to_position(entry),
857
                                  None, parents, (method, None))
858
        return result
859
    
860
    def keys(self):
861
        """Get all the keys in the collection.
862
        
863
        The keys are not ordered.
864
        """
865
        self._check_read()
866
        return [node[1] for node in self._graph_index.iter_all_entries()]
867
    
868
    def _node_to_position(self, node):
869
        """Convert an index value to position details."""
870
        bits = node[2].split(' ')
871
        # It would be nice not to read the entire gzip.
872
        start = int(bits[0])
873
        stop = int(bits[1])
874
        basis_end = int(bits[2])
875
        delta_end = int(bits[3])
876
        return node[0], start, stop, basis_end, delta_end
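

# A minimal sketch (not part of the original module) of the index value
# decoded by _node_to_position above: "<group offset> <group length>
# <record start> <record end>", as written by the flush() closure in
# GroupCompressVersionedFiles._insert_record_stream; the node below is
# hypothetical.
def _node_to_position_example():
    index = _GCGraphIndex(graph_index=None, is_locked=None)
    node = ('demo-index', ('demo-key',), '12 345 0 67')
    assert index._node_to_position(node) == ('demo-index', 12, 345, 0, 67)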
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
877
878
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
879
def _get_longest_match(equivalence_table, pos, max_pos, locations):
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
880
    """Get the longest possible match for the current position."""
881
    range_start = pos
882
    range_len = 0
883
    copy_ends = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
884
    while pos < max_pos:
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
885
        if locations is None:
886
            locations = equivalence_table.get_idx_matches(pos)
887
        if locations is None:
888
            # No more matches, just return whatever we have, but we know that
889
            # this last position is not going to match anything
890
            pos += 1
891
            break
892
        else:
893
            if copy_ends is None:
894
                # We are starting a new range
895
                copy_ends = [loc + 1 for loc in locations]
896
                range_len = 1
897
                locations = None # Consumed
898
            else:
899
                # We are currently in the middle of a match
900
                next_locations = set(copy_ends).intersection(locations)
901
                if len(next_locations):
902
                    # range continues
903
                    copy_ends = [loc + 1 for loc in next_locations]
904
                    range_len += 1
905
                    locations = None # Consumed
906
                else:
907
                    # But we are done with this match, we should be
908
                    # starting a new one, though. We will pass back 'locations'
909
                    # so that we don't have to do another lookup.
910
                    break
911
        pos += 1
912
    if copy_ends is None:
913
        return None, pos, locations
914
    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
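

# A minimal sketch (not part of the original module): exercising
# _get_longest_match with a stand-in for the EquivalenceTable that only
# provides get_idx_matches(pos); the class and data below are hypothetical.
class _DemoEquivalenceTable(object):

    def __init__(self, matches):
        self._matches = matches

    def get_idx_matches(self, pos):
        return self._matches.get(pos)


def _get_longest_match_example():
    # Right-side lines 0 and 1 match left-side lines 5 and 6 respectively, so
    # the best copy is two lines starting at left index 5, right index 0.
    table = _DemoEquivalenceTable({0: [5], 1: [6]})
    match, next_pos, _ = _get_longest_match(table, 0, 2, None)
    assert match == (5, 0, 2)
    assert next_pos == 2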
915
916
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
917
try:
0.23.21 by John Arbash Meinel
Rename the extension to _pyx, since Robert prefers that form
918
    from bzrlib.plugins.groupcompress_rabin import _groupcompress_pyx
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
919
except ImportError:
920
    pass