# groupcompress, a bzr plugin providing new compression logic.
# Copyright (C) 2008 Canonical Limited.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#
"""Core compression logic for compressing streams of related files."""
from itertools import izip
from cStringIO import StringIO
import zlib

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    pack,
    patiencediff,
    progress,
    trace,
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    contains_linebreaks,
    sha_string,
    sha_strings,
    split_lines,
    )
from bzrlib.plugins.index2.btree_index import BTreeBuilder
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )


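# A serialised record, as produced by GroupCompressor.compress() and consumed
# by parse() below, looks like this sketch (values hypothetical):
#
#   label: file-id\x00revision-id\n
#   sha1: <40 hex digits of the text's sha1>\n
#   c,0,120\n      (copy 120 bytes starting at byte 0 of the group so far)
#   i,2\n          (insert the next 2 lines verbatim)
#   some line\n
#   another line\n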
def parse(line_list):
    """Parse a serialised record into its label, sha1 and delta operations."""
    result = []
    lines = iter(line_list)
    next = lines.next
    label_line = lines.next()
    sha1_line = lines.next()
    if (not label_line.startswith('label: ') or
        not sha1_line.startswith('sha1: ')):
        raise AssertionError("bad text record %r" % line_list)
    label = tuple(label_line[7:-1].split('\x00'))
    sha1 = sha1_line[6:-1]
    for header in lines:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            result.append((op, numbers[0], numbers[1], None))
        else:
            contents = [next() for i in xrange(numbers[0])]
            result.append((op, None, numbers[0], contents))
    return label, sha1, result


def apply_delta(basis, delta):
    """Apply a delta to a basis string, recreating the stored lines."""
    lines = []
    # eq ranges occur where gaps occur
    # start, end refer to offsets in basis
    for op, start, count, delta_lines in delta:
        if op == 'c':
            lines.append(basis[start:start+count])
        else:
            lines.extend(delta_lines)
    trim_encoding_newline(lines)
    return lines
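
# A minimal worked example of apply_delta (values hypothetical): with
# basis = 'hello\nworld\n' and
# delta = [('c', 0, 6, None), ('i', None, 2, ['again\n', '\n'])]
# the copy contributes basis[0:6] ('hello\n'), the insert contributes its two
# lines, and trim_encoding_newline() strips the trailing-newline marker that
# compress() appended, yielding ['hello\n', 'again\n'].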


def trim_encoding_newline(lines):
    if lines[-1] == '\n':
        del lines[-1]
    else:
        lines[-1] = lines[-1][:-1]


class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because of having a
    similar task. However some key differences apply:
     - there is no junk, we want a minimal edit not a human readable diff.
     - we don't filter very common lines (because we don't know where a good
       range will start, and after the first text we want to be emitting
       minimal edits only).
     - we chain the left side, not the right side.
     - we incrementally update the adjacency matrix as new lines are provided.
     - we look for matches in all of the left side, so the routine which does
       the analogous task of find_longest_match does not need to filter on the
       left side.
    """

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        self._delta = delta
        self.lines = []
        self.line_offsets = []
        self.endpoint = 0
        self.input_bytes = 0
        # line: set(locations it appears at), set(N+1 for N in locations)
        self.line_locations = {}
        self.labels_deltas = {}

    def compress(self, key, lines, expected_sha):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression.
        :param lines: The lines to be compressed. Must be split
            on \n, with the \n preserved.
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        sha1 = sha_strings(lines)
        label = '\x00'.join(key)
        # setup good encoding for trailing \n support.
        if not lines or lines[-1].endswith('\n'):
            lines.append('\n')
        else:
            lines[-1] = lines[-1] + '\n'
        new_lines = []
        new_lines.append('label: %s\n' % label)
        new_lines.append('sha1: %s\n' % sha1)
        index_lines = [False, False]
        pos = 0
        line_locations = self.line_locations
        copying = False
        new_len = 0
        new_start = 0
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we traverse the input a
        # line at a time, looking each line up in line_locations.
        while pos < len(lines):
            line = lines[pos]
            if line not in line_locations:
                if copying:
                    # flush the copy
                    copy_start = min(copy_ends) - copy_len
                    stop_byte = self.line_offsets[copy_start + copy_len - 1]
                    if copy_start == 0:
                        start_byte = 0
                    else:
                        start_byte = self.line_offsets[copy_start - 1]
                    bytes = stop_byte - start_byte
                    copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
                    insert_instruction = "i,%d\n" % copy_len
                    if (bytes + len(insert_instruction) >
                        len(copy_control_instruction)):
                        new_lines.append(copy_control_instruction)
                        index_lines.append(False)
                    else:
                        # inserting is shorter than copying, so insert.
                        new_lines.append(insert_instruction)
                        new_lines.extend(lines[new_start:new_start+copy_len])
                        index_lines.extend([False]*(copy_len + 1))
                    copying = False
                    new_start = pos
                    new_len = 1
                else:
                    new_len += 1
            else:
                if copying:
                    locations, next = line_locations[line]
                    next_locations = locations.intersection(copy_ends)
                    if len(next_locations):
                        # range continues
                        copy_len += 1
                        copy_ends = set(loc + 1 for loc in next_locations)
                    else:
                        # range stops, flush and start a new copy range
                        copy_start = min(copy_ends) - copy_len
                        stop_byte = self.line_offsets[copy_start + copy_len - 1]
                        if copy_start == 0:
                            start_byte = 0
                        else:
                            start_byte = self.line_offsets[copy_start - 1]
                        bytes = stop_byte - start_byte
                        copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
                        insert_instruction = "i,%d\n" % copy_len
                        if (bytes + len(insert_instruction) >
                            len(copy_control_instruction)):
                            new_lines.append(copy_control_instruction)
                            index_lines.append(False)
                        else:
                            # inserting is shorter than copying, so insert.
                            new_lines.append(insert_instruction)
                            new_lines.extend(lines[new_start:new_start+copy_len])
                            index_lines.extend([False]*(copy_len + 1))
                        copy_len = 1
                        copy_ends = next
                        new_start = pos
                else:
                    # Flush
                    if new_len:
                        new_lines.append("i,%d\n" % new_len)
                        new_lines.extend(lines[new_start:new_start+new_len])
                        index_lines.append(False)
                        index_lines.extend([True]*new_len)
                    # setup a copy
                    copy_len = 1
                    copy_ends = line_locations[line][1]
                    copying = True
                    new_start = pos
            pos += 1
        if copying:
            copy_start = min(copy_ends) - copy_len
            stop_byte = self.line_offsets[copy_start + copy_len - 1]
            if copy_start == 0:
                start_byte = 0
            else:
                start_byte = self.line_offsets[copy_start - 1]
            bytes = stop_byte - start_byte
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
            insert_instruction = "i,%d\n" % copy_len
            if (bytes + len(insert_instruction) >
                len(copy_control_instruction)):
                new_lines.append(copy_control_instruction)
                index_lines.append(False)
            else:
                # inserting is shorter than copying, so insert.
                new_lines.append(insert_instruction)
                new_lines.extend(lines[new_start:new_start+copy_len])
                index_lines.extend([False]*(copy_len + 1))
        elif new_len:
            new_lines.append("i,%d\n" % new_len)
            new_lines.extend(lines[new_start:new_start+new_len])
            index_lines.append(False)
            index_lines.extend([True]*new_len)
        delta_start = (self.endpoint, len(self.lines))
        self.output_lines(new_lines, index_lines)
        trim_encoding_newline(lines)
        self.input_bytes += sum(map(len, lines))
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        return sha1, self.endpoint
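
    # A minimal usage sketch (keys and texts hypothetical): the second text is
    # mostly encoded as copy instructions against bytes already in the group.
    #
    #   compressor = GroupCompressor()
    #   compressor.compress(('rev-1',), ['common line\n'], None)
    #   compressor.compress(('rev-2',), ['common line\n', 'new line\n'], None)
    #   lines, sha1 = compressor.extract(('rev-2',))
    #
    # Note the copy-versus-insert trade-off above: a copy instruction such as
    # "c,1024,4\n" costs 9 bytes, while re-inserting that 4 byte line costs
    # len("i,1\n") + 4 = 8 bytes, so very short ranges are cheaper to insert
    # than to copy.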
    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over bytes and the sha1.
        """
        delta_details = self.labels_deltas[key]
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
        label, sha1, delta = parse(delta_lines)
        if label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        # Perhaps we want to keep the line offsets too in memory at least?
        lines = apply_delta(''.join(self.lines), delta)
        sha1 = sha_strings(lines)
        return lines, sha1

    def output_lines(self, new_lines, index_lines):
        """Output some lines.

        :param new_lines: The lines to output.
        :param index_lines: A boolean flag for each line - when True, index
            that line.
        """
        endpoint = self.endpoint
        offset = len(self.lines)
        for (pos, line), index in izip(enumerate(new_lines), index_lines):
            self.lines.append(line)
            endpoint += len(line)
            self.line_offsets.append(endpoint)
            if index:
                indices, next_lines = self.line_locations.setdefault(line,
                    (set(), set()))
                indices.add(pos + offset)
                next_lines.add(pos + offset + 1)
        self.endpoint = endpoint
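
    # For example (hypothetical): after indexing the lines
    # ['a\n', 'b\n', 'a\n'] at positions 0, 1 and 2, line_locations holds
    #   'a\n' -> ({0, 2}, {1, 3})
    #   'b\n' -> ({1}, {2})
    # i.e. for each line, the set of positions it occupies and the set of
    # positions that would extend a copy range by one more line.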

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)

def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests, it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long should keys be.
    """
    def factory(transport):
        parents = graph or delta
        ref_length = 0
        if graph:
            ref_length += 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory
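
# A minimal usage sketch, assuming a writable bzrlib transport (the transport
# URL and key are hypothetical):
#
#   from bzrlib.transport import get_transport
#   factory = make_pack_factory(graph=True, delta=True, keylength=1)
#   vf = factory(get_transport('memory:///'))
#   vf.add_lines(('key1',), (), ['some content\n'])
#   cleanup_pack_group(vf)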


def cleanup_pack_group(versioned_files):
    versioned_files.writer.end()
    versioned_files.stream.close()


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint the
            add routine may error or may succeed - but you will be unable to
            read the data back accurately. (Checking the lines have been split
            correctly is expensive and extremely unlikely to catch bugs so it
            is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        bytes = ''.join(lines)
        record = FulltextContentFactory(key, parents, None, bytes)
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
        return sha1, len(bytes), None

    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            fulltext = split_lines(record.get_bytes_as('fulltext'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, fulltext, key, None, head_cache))
        return parent_cache[key]

    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')

    def _check_add(self, key, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        version_id = key[-1]
        if contains_whitespace(version_id):
            raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        result = {}
        sources = [self._index]
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        if self._unadded_refs:
            for key in missing:
                if key in self._unadded_refs:
                    result[key] = self._unadded_refs[key]
        return result

    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        keys = set(keys)
        if not keys:
            return
        if not self._index.has_graph:
            # Cannot topological order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                parent_map[key] = self._unadded_refs[key]
                locations[key] = None
            present_keys = topo_sort(parent_map)
            # Now group by source:
        else:
            present_keys = locations.keys()
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                present_keys.append(key)
                locations[key] = None
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                lines, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                # read
                read_memo = index_memo[0:3]
                zdata = self._access.get_raw_records([read_memo]).next()
                # decompress
                plain = zlib.decompress(zdata)
                # parse
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
                label, sha1, delta = parse(delta_lines)
                if label != key:
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                basis = plain[:index_memo[3]]
                lines = apply_delta(basis, delta)
            bytes = ''.join(lines)
            yield FulltextContentFactory(key, parents, sha1, bytes)
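
    # Two retrieval paths feed the loop above: records still buffered in the
    # current GroupCompressor are rebuilt via its extract() method, while
    # records already flushed to the pack are fetched through self._access,
    # zlib-decompressed and reassembled with parse() and apply_delta().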
    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result

    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        for _ in self._insert_record_stream(stream):
            pass

    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
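
        # The index value stored for each key is a sketch like (hypothetical
        # numbers) "0 4096 1024 2048": the zlib group's start and length
        # within the pack, then the record's start and end byte offsets
        # inside the uncompressed group, which _node_to_position() in
        # _GCGraphIndex splits back out.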
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind == 'fulltext':
                bytes = record.get_bytes_as('fulltext')
            else:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
            found_sha1, end_point = self._compressor.compress(record.key,
                split_lines(bytes), record.sha1)
            self._unadded_refs[record.key] = record.parents
            yield found_sha1
            keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            if basis_end > 1024 * 1024 * 20:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)

    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback, returns True if the index is locked and
            thus usable.
        :param parents: If True, record knit parents, if not do not record
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked

    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the Immutable GraphIndex
        backing this _GCGraphIndex, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples:
                         (key, value, node_refs).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)

    def _check_read(self):
        """Raise if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _check_write_ok(self):
        """Raise if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        result = {}
        if self._parents:
            for node in nodes:
                result[node[1]] = node[3][0]
        else:
            for node in nodes:
                result[node[1]] = None
        return result

    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys, False)
        for entry in entries:
            key = entry[1]
            if not self._parents:
                parents = None
            else:
                parents = entry[3][0]
            value = entry[2]
            method = 'group'
            result[key] = (self._node_to_position(entry),
                None, parents, (method, None))
        return result

    def keys(self):
        """Get all the keys in the collection.

        The keys are not ordered.
        """
        self._check_read()
        return [node[1] for node in self._graph_index.iter_all_entries()]

    def _node_to_position(self, node):
        """Convert an index value to position details."""
        bits = node[2].split(' ')
        # It would be nice not to read the entire gzip.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end