~bzr-pqm/bzr/bzr.dev

0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1
# groupcompress, a bzr plugin providing new compression logic.
2
# Copyright (C) 2008 Canonical Limited.
3
# 
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License version 2 as published
6
# by the Free Software Foundation.
7
# 
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
# 
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
16
# 
17
18
"""Core compression logic for compressing streams of related files."""
19
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
20
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
21
from cStringIO import StringIO
0.25.3 by John Arbash Meinel
Add a encode/decode base128 functions.
22
import struct
0.17.5 by Robert Collins
nograph tests completely passing.
23
import zlib
24
0.17.4 by Robert Collins
Annotate.
25
from bzrlib import (
26
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
27
    debug,
0.17.4 by Robert Collins
Annotate.
28
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
29
    errors,
0.17.4 by Robert Collins
Annotate.
30
    graph as _mod_graph,
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
31
    osutils,
0.17.4 by Robert Collins
Annotate.
32
    pack,
33
    patiencediff,
    progress,
    trace,
34
    )
35
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
36
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
37
from bzrlib.osutils import (
38
    contains_whitespace,
39
    contains_linebreaks,
40
    sha_string,
41
    sha_strings,
42
    split_lines,
43
    )
0.17.21 by Robert Collins
Update groupcompress to bzrlib 1.10.
44
from bzrlib.btree_index import BTreeBuilder
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
45
from bzrlib.lru_cache import LRUSizeCache
0.17.9 by Robert Collins
Initial stab at repository format support.
46
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
47
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
48
    adapter_registry,
49
    AbsentContentFactory,
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
50
    ChunkedContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
51
    FulltextContentFactory,
52
    VersionedFiles,
53
    )
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
54
from bzrlib.plugins.groupcompress import errors as gc_errors
0.17.2 by Robert Collins
Core proof of concept working.
55
0.23.55 by John Arbash Meinel
Make sure the default is _FAST=False for now.
56
_FAST = False
0.17.2 by Robert Collins
Core proof of concept working.
57
0.25.3 by John Arbash Meinel
Add a encode/decode base128 functions.
58
def encode_base128_int(val):
59
    """Convert an integer into a 7-bit lsb encoding."""
60
    bytes = []
61
    count = 0
62
    while val >= 0x80:
63
        bytes.append(chr((val | 0x80) & 0xFF))
64
        val >>= 7
65
    bytes.append(chr(val))
66
    return ''.join(bytes)
67
68
69
def decode_base128_int(bytes):
70
    """Decode an integer from a 7-bit lsb encoding."""
71
    offset = 0
72
    val = 0
73
    shift = 0
74
    bval = ord(bytes[offset])
75
    while bval >= 0x80:
76
        val |= (bval & 0x7F) << shift
77
        shift += 7
78
        offset += 1
79
        bval = ord(bytes[offset])
80
    val |= bval << shift
81
    offset += 1
82
    return val, offset
83
84
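# Example of the varint layout above (illustrative): each byte carries 7
# payload bits, least-significant group first, with the high bit set on
# every byte except the last one.
#
#   encode_base128_int(300)                  -> '\xac\x02'
#   decode_base128_int('\xac\x02' + 'rest')  -> (300, 2)
#
# The second value returned by decode_base128_int is the number of bytes
# consumed, so a caller can keep parsing from bytes[2:].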
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
85
def sort_gc_optimal(parent_map):
86
    """Sort and group the keys in parent_map into gc-optimal order.
87
88
    gc-optimal is defined (currently) as reverse-topological order, grouped by
89
    the key prefix.
90
91
    :return: A sorted-list of keys
92
    """
93
    # gc-optimal ordering is approximately reverse topological,
94
    # properly grouped by file-id.
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
95
    per_prefix_map = {}
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
96
    for item in parent_map.iteritems():
97
        key = item[0]
98
        if isinstance(key, str) or len(key) == 1:
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
99
            prefix = ''
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
100
        else:
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
101
            prefix = key[0]
102
        try:
103
            per_prefix_map[prefix].append(item)
104
        except KeyError:
105
            per_prefix_map[prefix] = [item]
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
106
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
107
    present_keys = []
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
108
    for prefix in sorted(per_prefix_map):
109
        present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
110
    return present_keys
111
112
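# Illustrative example: keys are grouped by their first element (the file
# prefix), prefixes are emitted in sorted order, and within a prefix the
# newest revisions come first.
#
#   parent_map = {
#       ('file-a', 'rev1'): (),
#       ('file-a', 'rev2'): (('file-a', 'rev1'),),
#       ('file-b', 'rev1'): (),
#   }
#   sort_gc_optimal(parent_map)
#   # -> [('file-a', 'rev2'), ('file-a', 'rev1'), ('file-b', 'rev1')]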
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
113
class GroupCompressBlockEntry(object):
114
    """Track the information about a single object inside a GC group.
115
116
    This is generally just the dumb data structure.
117
    """
118
119
    def __init__(self, key, type, sha1, start, length):
120
        self.key = key
121
        self.type = type # delta, fulltext, external?
122
        self.sha1 = sha1 # Sha1 of content
123
        self.start = start # Byte offset to start of data
124
        self.length = length # Length of content
125
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
126
    def __repr__(self):
127
        return '%s(%s, %s, %s, %s, %s)' % (
128
            self.__class__.__name__,
129
            self.key, self.type, self.sha1, self.start, self.length
130
            )
131
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
132
133
class GroupCompressBlock(object):
134
    """An object which maintains the internal structure of the compressed data.
135
136
    This tracks the meta info (start of text, length, type, etc.)
137
    """
138
0.25.5 by John Arbash Meinel
Now using a zlib compressed format.
139
    # Group Compress Block v1 Zlib
140
    GCB_HEADER = 'gcb1z\n'
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
141
142
    def __init__(self):
143
        # map by key? or just order in file?
144
        self._entries = {}
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
145
        self._content = None
146
        self._size = 0
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
147
148
    def _parse_header(self):
149
        """Parse the meta-info from the stream."""
150
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
151
    def __len__(self):
152
        return self._size
153
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
154
    @classmethod
155
    def from_bytes(cls, bytes):
156
        out = cls()
157
        if bytes[:6] != cls.GCB_HEADER:
158
            raise gc_errors.InvalidGroupCompressBlock(
159
                'bytes did not start with %r' % (cls.GCB_HEADER,))
0.25.4 by John Arbash Meinel
We at least have the rudimentary ability to encode and decode values.
160
        pos = bytes.index('\n', 6)
0.25.5 by John Arbash Meinel
Now using a zlib compressed format.
161
        z_header_length = int(bytes[6:pos])
162
        pos += 1
163
        pos2 = bytes.index('\n', pos)
164
        header_length = int(bytes[pos:pos2])
165
        if z_header_length == 0:
166
            assert header_length == 0
0.25.4 by John Arbash Meinel
We at least have the rudimentary ability to encode and decode values.
167
            return out
0.25.5 by John Arbash Meinel
Now using a zlib compressed format.
168
        pos = pos2 + 1
169
        pos2 = pos + z_header_length
170
        z_header_bytes = bytes[pos:pos2]
171
        assert len(z_header_bytes) == z_header_length
0.25.7 by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content.
172
        d = zlib.decompressobj()
173
        header_bytes = d.decompress(z_header_bytes)
0.25.5 by John Arbash Meinel
Now using a zlib compressed format.
174
        assert len(header_bytes) == header_length
175
        del z_header_bytes
0.25.4 by John Arbash Meinel
We at least have the rudimentary ability to encode and decode values.
176
        lines = header_bytes.split('\n')
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
177
        header_len = len(header_bytes)
0.25.5 by John Arbash Meinel
Now using a zlib compressed format.
178
        del header_bytes
0.25.4 by John Arbash Meinel
We at least have the rudimentary ability to encode and decode values.
179
        info_dict = {}
180
        for line in lines:
181
            if not line: #End of record
182
                if not info_dict:
183
                    break
184
                out.add_entry(**info_dict)
185
                info_dict = {}
186
                continue
187
            key, value = line.split(':', 1)
188
            if key == 'key':
189
                value = tuple(map(intern, value.split('\x00')))
190
            elif key in ('start', 'length'):
191
                value = int(value)
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
192
            elif key == 'type':
193
                value = intern(value)
0.25.4 by John Arbash Meinel
We at least have the rudimentary ability to encode and decode values.
194
            info_dict[key] = value
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
195
        zcontent = bytes[pos2:]
196
        if zcontent:
0.25.7 by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content.
197
            out._content = d.decompress(zcontent)
198
            assert d.flush() == ''
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
199
            out._size = header_len + len(out._content)
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
200
        return out
201
202
    def extract(self, key, sha1=None):
203
        """Extract the text for a specific key.
204
205
        :param key: The label used for this content
206
        :param sha1: TODO (should we validate only when sha1 is supplied?)
207
        :return: The bytes for the content
208
        """
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
209
        entry = self._entries[key]
210
        if entry.type == 'fulltext':
211
            bytes = self._content[entry.start:entry.start + entry.length]
212
        elif entry.type == 'delta':
213
            delta = self._content[entry.start:entry.start + entry.length]
214
            bytes = _groupcompress_pyx.apply_delta(self._content, delta)
215
        # XXX: sha1?
216
        return entry, bytes
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
217
218
    def add_entry(self, key, type, sha1, start, length):
219
        """Add new meta info about an entry.
220
221
        :param key: The key for the new content
222
        :param type: Whether this is a delta or fulltext entry (external?)
223
        :param sha1: sha1sum of the fulltext of this entry
224
        :param start: where the encoded bytes start
225
        :param length: total number of bytes in the encoded form
226
        :return: The entry?
227
        """
228
        entry = GroupCompressBlockEntry(key, type, sha1, start, length)
229
        assert key not in self._entries
230
        self._entries[key] = entry
231
        return entry
232
0.25.7 by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content.
233
    def to_bytes(self, content=''):
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
234
        """Encode the information into a byte stream."""
235
        chunks = []
236
        for key in sorted(self._entries):
237
            entry = self._entries[key]
238
            chunk = ('key:%s\n'
0.25.4 by John Arbash Meinel
We at least have the rudimentary ability to encode and decode values.
239
                     'sha1:%s\n'
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
240
                     'type:%s\n'
241
                     'start:%s\n'
242
                     'length:%s\n'
243
                     '\n'
244
                     ) % ('\x00'.join(entry.key),
0.25.4 by John Arbash Meinel
We at least have the rudimentary ability to encode and decode values.
245
                          entry.sha1,
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
246
                          entry.type,
247
                          entry.start,
248
                          entry.length,
249
                          )
250
            chunks.append(chunk)
0.25.5 by John Arbash Meinel
Now using a zlib compressed format.
251
        bytes = ''.join(chunks)
252
        info_len = len(bytes)
0.25.7 by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content.
253
        c = zlib.compressobj()
254
        z_bytes = []
255
        z_bytes.append(c.compress(bytes))
0.25.5 by John Arbash Meinel
Now using a zlib compressed format.
256
        del bytes
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
257
        # TODO: we may want to have the header compressed in the same chain
258
        #       as the data, or we may not, evaluate it
259
        #       having them compressed together is probably a win for
260
        #       revisions and the 'inv' portion of chk inventories. As the
261
        #       label in the header is duplicated in the text.
262
        #       For chk pages and real bytes, I would guess this is not
263
        #       true.
0.25.7 by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content.
264
        z_bytes.append(c.flush(zlib.Z_SYNC_FLUSH))
265
        z_len = sum(map(len, z_bytes))
266
        c_len = len(content)
267
        z_bytes.append(c.compress(content))
268
        z_bytes.append(c.flush())
269
        chunks = [self.GCB_HEADER,
270
                  '%d\n' % (z_len,),
271
                  '%d\n' % (info_len,),
272
                  #'%d\n' % (c_len,),
273
                 ]
274
        chunks.extend(z_bytes)
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
275
        return ''.join(chunks)
276
277
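    # Illustrative sketch of the byte stream produced by to_bytes() and
    # consumed by from_bytes():
    #
    #   'gcb1z\n'           fixed marker (Group Compress Block v1, zlib)
    #   '<z_header_len>\n'  length of the zlib-compressed entry header
    #   '<header_len>\n'    length of that header once decompressed
    #   <z_header>          'key:..\nsha1:..\ntype:..\nstart:..\nlength:..\n\n'
    #                       per entry, zlib-compressed
    #   <z_content>         the concatenated fulltexts/deltas, zlib-compressed
    #
    # Round-trip example (hypothetical key and content):
    #
    #   block = GroupCompressBlock()
    #   block.add_entry(('file-id', 'rev-id'), type='fulltext',
    #                   sha1=sha_string('hello\n'), start=0, length=6)
    #   data = block.to_bytes('hello\n')
    #   copy = GroupCompressBlock.from_bytes(data)
    #   entry, text = copy.extract(('file-id', 'rev-id'))  # text == 'hello\n'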
0.17.2 by Robert Collins
Core proof of concept working.
278
class GroupCompressor(object):
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
279
    """Produce a serialised group of compressed texts.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
280
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
281
    It contains code very similar to SequenceMatcher, because it has a similar
282
    task. However, some key differences apply:
283
     - there is no junk, we want a minimal edit not a human readable diff.
284
     - we don't filter very common lines (because we don't know where a good
285
       range will start, and after the first text we want to be emitting minimal
286
       edits only).
287
     - we chain the left side, not the right side
288
     - we incrementally update the adjacency matrix as new lines are provided.
289
     - we look for matches in all of the left side, so the routine which does
290
       the analogous task of find_longest_match does not need to filter on the
291
       left side.
292
    """
0.17.2 by Robert Collins
Core proof of concept working.
293
294
    def __init__(self, delta=True):
295
        """Create a GroupCompressor.
296
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
297
        :param delta: If False, do not compress records.
0.17.2 by Robert Collins
Core proof of concept working.
298
        """
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
299
        # Consider seeding the lines with some sort of GC Start flag, or
300
        # putting it as part of the output stream, rather than in the
301
        # compressed bytes.
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
302
        self.lines = []
0.17.2 by Robert Collins
Core proof of concept working.
303
        self.endpoint = 0
304
        self.input_bytes = 0
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
305
        self.num_keys = 0
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
306
        self.labels_deltas = {}
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
307
        self._last = None
0.23.26 by John Arbash Meinel
We now start to make use of the ability to extend the delta index
308
        self._delta_index = _groupcompress_pyx.DeltaIndex()
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
309
        self._block = GroupCompressBlock()
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
310
0.23.58 by John Arbash Meinel
fix up the failing tests.
311
    def compress(self, key, bytes, expected_sha, soft=False):
0.17.2 by Robert Collins
Core proof of concept working.
312
        """Compress lines with label key.
313
314
        :param key: A key tuple. It is stored in the output
0.17.26 by Robert Collins
Working better --gc-plain-chk.
315
            for identification of the text during decompression. If the last
316
            element is 'None' it is replaced with the sha1 of the text -
317
            e.g. sha1:xxxxxxx.
0.23.58 by John Arbash Meinel
fix up the failing tests.
318
        :param bytes: The bytes to be compressed
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
319
        :param expected_sha: If non-None, the sha the lines are believed to
0.17.2 by Robert Collins
Core proof of concept working.
320
            have. During compression the sha is calculated; a mismatch will
321
            cause an error.
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
322
        :param soft: Do a 'soft' compression. This means that we require larger
323
            ranges to match to be considered for a copy command.
0.17.2 by Robert Collins
Core proof of concept working.
324
        :return: The sha1 of the bytes, and the number of bytes accumulated in
325
            the group output so far.
326
        """
0.23.54 by John Arbash Meinel
'bzr pack' _FAST during compress() now is 32s versus 25s.
327
        if not _FAST or expected_sha is None:
328
            sha1 = sha_string(bytes)
329
        else:
330
            sha1 = expected_sha
0.17.26 by Robert Collins
Working better --gc-plain-chk.
331
        if key[-1] is None:
332
            key = key[:-1] + ('sha1:' + sha1,)
0.23.52 by John Arbash Meinel
Use the max_delta flag.
333
        input_len = len(bytes)
0.23.12 by John Arbash Meinel
Add a 'len:' field to the data.
334
        # By having action/label/sha1/len, we can parse the group if the index
335
        # was ever destroyed, we have the key in 'label', we know the final
336
        # bytes are valid from sha1, and we know where to find the end of this
337
        # record because of 'len'. (the delta record itself will store the
338
        # total length for the expanded record)
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
339
        # 'len: %d\n' costs approximately 1% increase in total data
340
        # Having the labels at all costs us 9-10% increase, 38% increase for
341
        # inventory pages, and 5.8% increase for text pages
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
342
        # new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
0.23.33 by John Arbash Meinel
Fix a bug when handling multiple large-range copies.
343
        if self._delta_index._source_offset != self.endpoint:
344
            raise AssertionError('_source_offset != endpoint'
345
                ' somehow the DeltaIndex got out of sync with'
346
                ' the output lines')
0.23.52 by John Arbash Meinel
Use the max_delta flag.
347
        max_delta_size = len(bytes) / 2
348
        delta = self._delta_index.make_delta(bytes, max_delta_size)
349
        if (delta is None):
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
350
            type = 'fulltext'
351
            length = len(bytes)
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
352
            self._delta_index.add_source(bytes, 0)
353
            new_chunks = [bytes]
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
354
        else:
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
355
            type = 'delta'
356
            length = len(delta)
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
357
            new_chunks = [delta]
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
358
            if _FAST:
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
359
                self._delta_index._source_offset += len(delta)
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
360
            else:
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
361
                self._delta_index.add_delta_source(delta, 0)
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
362
        self._block.add_entry(key, type=type, sha1=sha1,
363
                              start=self.endpoint, length=length)
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
364
        delta_start = (self.endpoint, len(self.lines))
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
365
        self.num_keys += 1
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
366
        self.output_chunks(new_chunks)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
367
        self.input_bytes += input_len
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
368
        delta_end = (self.endpoint, len(self.lines))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
369
        self.labels_deltas[key] = (delta_start, delta_end)
0.23.29 by John Arbash Meinel
Forgot to add the delta bytes to the index objects.
370
        if not self._delta_index._source_offset == self.endpoint:
371
            raise AssertionError('the delta index is out of sync'
372
                'with the output lines %s != %s'
373
                % (self._delta_index._source_offset, self.endpoint))
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
374
        return sha1, self.endpoint, type, length
0.17.2 by Robert Collins
Core proof of concept working.
375
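    # Illustrative usage sketch (text1/text2 stand for related byte strings):
    # the first text in a group always becomes a fulltext and seeds the
    # delta index; later texts are stored as deltas against everything
    # already in the group whenever the delta is small enough.
    #
    #   compressor = GroupCompressor()
    #   sha1, end, kind, _ = compressor.compress(('f-id', 'rev1'), text1, None)
    #   # kind == 'fulltext'
    #   sha1, end, kind, _ = compressor.compress(('f-id', 'rev2'), text2, None)
    #   # kind is normally 'delta': the delta is only used when it comes out
    #   # under half the size of the new fulltext
    #   compressor.ratio()   # overall input bytes / output bytes so far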
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
376
    def extract(self, key):
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
377
        """Extract a key previously added to the compressor.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
378
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
379
        :param key: The key to extract.
380
        :return: An iterable over bytes and the sha1.
381
        """
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
382
        delta_details = self.labels_deltas[key]
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
383
        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
384
        stored_bytes = ''.join(delta_chunks)
385
        # TODO: Fix this, we shouldn't really be peeking here
386
        entry = self._block._entries[key]
387
        if entry.type == 'fulltext':
388
            bytes = stored_bytes
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
389
        else:
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
390
            assert entry.type == 'delta'
391
            # XXX: This is inefficient at best
392
            source = ''.join(self.lines)
0.25.8 by John Arbash Meinel
Fix up the tests. Mostly it was just changing things to
393
            bytes = _groupcompress_pyx.apply_delta(source, stored_bytes)
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
394
            assert entry.sha1 == sha_string(bytes)
395
        return bytes, entry.sha1
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
396
397
    def output_chunks(self, new_chunks):
398
        """Output some chunks.
399
400
        :param new_chunks: The chunks to output.
401
        """
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
402
        self._last = (len(self.lines), self.endpoint)
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
403
        endpoint = self.endpoint
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
404
        self.lines.extend(new_chunks)
405
        endpoint += sum(map(len, new_chunks))
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
406
        self.endpoint = endpoint
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
407
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
408
    def pop_last(self):
409
        """Call this if you want to 'revoke' the last compression.
410
411
        After this, the data structures will be rolled back, but you cannot do
412
        more compression.
413
        """
414
        self._delta_index = None
415
        del self.lines[self._last[0]:]
416
        self.endpoint = self._last[1]
417
        self._last = None
418
0.17.2 by Robert Collins
Core proof of concept working.
419
    def ratio(self):
420
        """Return the overall compression ratio."""
421
        return float(self.input_bytes) / float(self.endpoint)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
422
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
423
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
424
def make_pack_factory(graph, delta, keylength):
425
    """Create a factory for creating a pack based groupcompress.
426
427
    This is only functional enough to run interface tests; it doesn't try to
428
    provide a full pack environment.
429
    
430
    :param graph: Store a graph.
431
    :param delta: Delta compress contents.
432
    :param keylength: How long keys should be.
433
    """
434
    def factory(transport):
435
        parents = graph or delta
436
        ref_length = 0
437
        if graph:
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
438
            ref_length = 1
0.17.7 by Robert Collins
Update for current index2 changes.
439
        graph_index = BTreeBuilder(reference_lists=ref_length,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
440
            key_elements=keylength)
441
        stream = transport.open_write_stream('newpack')
442
        writer = pack.ContainerWriter(stream.write)
443
        writer.begin()
444
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
0.17.9 by Robert Collins
Initial stab at repository format support.
445
            add_callback=graph_index.add_nodes)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
446
        access = _DirectPackAccess({})
447
        access.set_writer(writer, graph_index, (transport, 'newpack'))
0.17.2 by Robert Collins
Core proof of concept working.
448
        result = GroupCompressVersionedFiles(index, access, delta)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
449
        result.stream = stream
450
        result.writer = writer
451
        return result
452
    return factory
453
454
455
def cleanup_pack_group(versioned_files):
0.17.23 by Robert Collins
Only decompress as much of the zlib data as is needed to read the text recipe.
456
    versioned_files.writer.end()
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
457
    versioned_files.stream.close()
458
459
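# Illustrative usage sketch ('transport' is any writable bzrlib transport,
# e.g. bzrlib.transport.get_transport('.')):
#
#   factory = make_pack_factory(graph=True, delta=True, keylength=1)
#   vf = factory(transport)    # GroupCompressVersionedFiles writing 'newpack'
#   # ... add texts via vf.add_lines() or vf.insert_record_stream() ...
#   cleanup_pack_group(vf)     # ends the pack writer and closes the stream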
460
class GroupCompressVersionedFiles(VersionedFiles):
461
    """A group-compress based VersionedFiles implementation."""
462
0.17.2 by Robert Collins
Core proof of concept working.
463
    def __init__(self, index, access, delta=True):
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
464
        """Create a GroupCompressVersionedFiles object.
465
466
        :param index: The index object storing access and graph data.
467
        :param access: The access object storing raw data.
0.17.2 by Robert Collins
Core proof of concept working.
468
        :param delta: Whether to delta compress or just entropy compress.
469
        """
470
        self._index = index
471
        self._access = access
472
        self._delta = delta
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
473
        self._unadded_refs = {}
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
474
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
0.17.2 by Robert Collins
Core proof of concept working.
475
476
    def add_lines(self, key, parents, lines, parent_texts=None,
477
        left_matching_blocks=None, nostore_sha=None, random_id=False,
478
        check_content=True):
479
        """Add a text to the store.
480
481
        :param key: The key tuple of the text to add.
482
        :param parents: The parents key tuples of the text to add.
483
        :param lines: A list of lines. Each line must be a bytestring. And all
484
            of them except the last must be terminated with \n and contain no
485
            other \n's. The last line may either contain no \n's or a single
486
            terminating \n. If the lines list does meet this constraint the add
487
            routine may error or may succeed - but you will be unable to read
488
            the data back accurately. (Checking the lines have been split
489
            correctly is expensive and extremely unlikely to catch bugs so it
490
            is not done at runtime unless check_content is True.)
491
        :param parent_texts: An optional dictionary containing the opaque 
492
            representations of some or all of the parents of version_id to
493
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
494
            returned by add_lines or data corruption can be caused.
495
        :param left_matching_blocks: a hint about which areas are common
496
            between the text and its left-hand-parent.  The format is
497
            the SequenceMatcher.get_matching_blocks format.
498
        :param nostore_sha: Raise ExistingContent and do not add the lines to
499
            the versioned file if the digest of the lines matches this.
500
        :param random_id: If True a random id has been selected rather than
501
            an id determined by some deterministic process such as a converter
502
            from a foreign VCS. When True the backend may choose not to check
503
            for uniqueness of the resulting key within the versioned file, so
504
            this should only be done when the result is expected to be unique
505
            anyway.
506
        :param check_content: If True, the lines supplied are verified to be
507
            bytestrings that are correctly formed lines.
508
        :return: The text sha1, the number of bytes in the text, and an opaque
509
                 representation of the inserted version which can be provided
510
                 back to future add_lines calls in the parent_texts dictionary.
511
        """
512
        self._index._check_write_ok()
513
        self._check_add(key, lines, random_id, check_content)
514
        if parents is None:
515
            # The caller might pass None if there is no graph data, but kndx
516
            # indexes can't directly store that, so we give them
517
            # an empty tuple instead.
518
            parents = ()
519
        # double handling for now. Make it work until then.
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
520
        length = sum(map(len, lines))
521
        record = ChunkedContentFactory(key, parents, None, lines)
0.17.5 by Robert Collins
nograph tests completely passing.
522
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
523
        return sha1, length, None
0.17.2 by Robert Collins
Core proof of concept working.
524
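    # Illustrative usage sketch (hypothetical key and parent tuples):
    #
    #   sha1, num_bytes, _ = vf.add_lines(
    #       ('file-id', 'rev-2'),               # key of the new text
    #       [('file-id', 'rev-1')],             # parent keys
    #       ['first line\n', 'second line\n'])  # \n-terminated lines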
0.17.4 by Robert Collins
Annotate.
525
    def annotate(self, key):
526
        """See VersionedFiles.annotate."""
527
        graph = Graph(self)
0.17.5 by Robert Collins
nograph tests completely passing.
528
        parent_map = self.get_parent_map([key])
529
        if not parent_map:
530
            raise errors.RevisionNotPresent(key, self)
531
        if parent_map[key] is not None:
532
            search = graph._make_breadth_first_searcher([key])
533
            keys = set()
534
            while True:
535
                try:
536
                    present, ghosts = search.next_with_ghosts()
537
                except StopIteration:
538
                    break
539
                keys.update(present)
540
            parent_map = self.get_parent_map(keys)
541
        else:
542
            keys = [key]
543
            parent_map = {key:()}
0.17.4 by Robert Collins
Annotate.
544
        head_cache = _mod_graph.FrozenHeadsCache(graph)
545
        parent_cache = {}
546
        reannotate = annotate.reannotate
547
        for record in self.get_record_stream(keys, 'topological', True):
548
            key = record.key
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
549
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
0.17.4 by Robert Collins
Annotate.
550
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
551
            parent_cache[key] = list(
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
552
                reannotate(parent_lines, chunks, key, None, head_cache))
0.17.4 by Robert Collins
Annotate.
553
        return parent_cache[key]
554
0.17.5 by Robert Collins
nograph tests completely passing.
555
    def check(self, progress_bar=None):
556
        """See VersionedFiles.check()."""
557
        keys = self.keys()
558
        for record in self.get_record_stream(keys, 'unordered', True):
559
            record.get_bytes_as('fulltext')
560
0.17.2 by Robert Collins
Core proof of concept working.
561
    def _check_add(self, key, lines, random_id, check_content):
562
        """check that version_id and lines are safe to add."""
563
        version_id = key[-1]
0.17.26 by Robert Collins
Working better --gc-plain-chk.
564
        if version_id is not None:
565
            if contains_whitespace(version_id):
566
                raise errors.InvalidRevisionId(version_id, self)
0.17.2 by Robert Collins
Core proof of concept working.
567
        self.check_not_reserved_id(version_id)
568
        # TODO: If random_id==False and the key is already present, we should
569
        # probably check that the existing content is identical to what is
570
        # being inserted, and otherwise raise an exception.  This would make
571
        # the bundle code simpler.
572
        if check_content:
573
            self._check_lines_not_unicode(lines)
574
            self._check_lines_are_lines(lines)
575
0.17.5 by Robert Collins
nograph tests completely passing.
576
    def get_parent_map(self, keys):
577
        """Get a map of the parents of keys.
578
579
        :param keys: The keys to look up parents for.
580
        :return: A mapping from keys to parents. Absent keys are absent from
581
            the mapping.
582
        """
583
        result = {}
584
        sources = [self._index]
585
        source_results = []
586
        missing = set(keys)
587
        for source in sources:
588
            if not missing:
589
                break
590
            new_result = source.get_parent_map(missing)
591
            source_results.append(new_result)
592
            result.update(new_result)
593
            missing.difference_update(set(new_result))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
594
        if self._unadded_refs:
595
            for key in missing:
596
                if key in self._unadded_refs:
597
                    result[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
598
        return result
599
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
600
    def _get_block(self, index_memo):
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
601
        read_memo = index_memo[0:3]
602
        # get the group:
603
        try:
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
604
            block = self._group_cache[read_memo]
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
605
        except KeyError:
606
            # read the group
607
            zdata = self._access.get_raw_records([read_memo]).next()
608
            # decompress - whole thing - this is not a bug, as it
609
            # permits caching. We might want to store the partially
610
            # decompressed group and decompress object, so that recent
611
            # texts are not penalised by big groups.
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
612
            block = GroupCompressBlock.from_bytes(zdata)
613
            self._group_cache[read_memo] = block
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
614
        # cheapo debugging:
615
        # print len(zdata), len(plain)
616
        # parse - requires split_lines, better to have byte offsets
617
        # here (but not by much - we only split the region for the
618
        # recipe, and we often want to end up with lines anyway).
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
619
        return block
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
620
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
621
    def get_missing_compression_parent_keys(self):
622
        """Return the keys of missing compression parents.
623
624
        Missing compression parents occur when a record stream was missing
625
        basis texts, or an index was scanned that had missing basis texts.
626
        """
627
        # GroupCompress cannot currently reference texts that are not in the
628
        # group, so this is valid for now
629
        return frozenset()
630
0.17.5 by Robert Collins
nograph tests completely passing.
631
    def get_record_stream(self, keys, ordering, include_delta_closure):
632
        """Get a stream of records for keys.
633
634
        :param keys: The keys to include.
635
        :param ordering: Either 'unordered' or 'topological'. A topologically
636
            sorted stream has compression parents strictly before their
637
            children.
638
        :param include_delta_closure: If True then the closure across any
639
            compression parents will be included (in the opaque data).
640
        :return: An iterator of ContentFactory objects, each of which is only
641
            valid until the iterator is advanced.
642
        """
643
        # keys might be a generator
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
644
        orig_keys = list(keys)
645
        keys = set(orig_keys)
0.17.5 by Robert Collins
nograph tests completely passing.
646
        if not keys:
647
            return
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
648
        if (not self._index.has_graph
649
            and ordering in ('topological', 'gc-optimal')):
0.17.5 by Robert Collins
nograph tests completely passing.
650
            # Cannot topological order when no graph has been stored.
651
            ordering = 'unordered'
652
        # Cheap: iterate
653
        locations = self._index.get_build_details(keys)
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
654
        local_keys = frozenset(keys).intersection(set(self._unadded_refs))
0.17.5 by Robert Collins
nograph tests completely passing.
655
        if ordering == 'topological':
656
            # would be better to not globally sort initially but instead
657
            # start with one key, recurse to its oldest parent, then grab
658
            # everything in the same group, etc.
659
            parent_map = dict((key, details[2]) for key, details in
660
                locations.iteritems())
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
661
            for key in local_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
662
                parent_map[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
663
            present_keys = topo_sort(parent_map)
664
            # Now group by source:
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
665
        elif ordering == 'gc-optimal':
666
            parent_map = dict((key, details[2]) for key, details in
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
667
                              locations.iteritems())
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
668
            for key in local_keys:
669
                parent_map[key] = self._unadded_refs[key]
670
            # XXX: This only optimizes for the target ordering. We may need to
671
            #      balance that with the time it takes to extract ordering, by
672
            #      somehow grouping based on locations[key][0:3]
673
            present_keys = sort_gc_optimal(parent_map)
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
674
        elif ordering == 'as-requested':
0.20.32 by John Arbash Meinel
Fix bug #336373 by adding local keys to locations after the fact, rather than before.
675
            present_keys = [key for key in orig_keys if key in locations
676
                            or key in local_keys]
0.17.5 by Robert Collins
nograph tests completely passing.
677
        else:
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
678
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
679
            # Otherwise we thrash the _group_cache and destroy performance
680
            def get_group(key):
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
681
                # This is the group the bytes are stored in, followed by the
682
                # location in the group
683
                return locations[key][0]
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
684
            present_keys = sorted(locations.iterkeys(), key=get_group)
685
            # We don't have an ordering for keys in the in-memory object, but
686
            # let's process the in-memory ones first.
687
            present_keys = list(local_keys) + present_keys
0.20.32 by John Arbash Meinel
Fix bug #336373 by adding local keys to locations after the fact, rather than before.
688
        locations.update((key, None) for key in local_keys)
0.17.5 by Robert Collins
nograph tests completely passing.
689
        absent_keys = keys.difference(set(locations))
690
        for key in absent_keys:
691
            yield AbsentContentFactory(key)
692
        for key in present_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
693
            if key in self._unadded_refs:
0.20.28 by John Arbash Meinel
Fix typo with the recent lines => chunks rename.
694
                chunks, sha1 = self._compressor.extract(key)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
695
                parents = self._unadded_refs[key]
696
            else:
697
                index_memo, _, parents, (method, _) = locations[key]
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
698
                block = self._get_block(index_memo)
699
                entry, bytes = block.extract(key)
700
                sha1 = entry.sha1
701
                if not _FAST and sha_string(bytes) != sha1:
702
                    raise AssertionError('sha1 sum did not match')
703
            yield FulltextContentFactory(key, parents, sha1, bytes)
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
704
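    # Illustrative sketch of consuming the stream ('vf' and 'keys' are
    # hypothetical):
    #
    #   for record in vf.get_record_stream(keys, 'gc-optimal', True):
    #       if record.storage_kind == 'absent':
    #           continue                  # key not present in this store
    #       text = record.get_bytes_as('fulltext')
    #
    # 'unordered' is actually sorted by storage group to keep the LRU group
    # cache warm, 'topological' puts compression parents before children,
    # 'gc-optimal' groups by key prefix in reverse-topological order, and
    # 'as-requested' preserves the caller's ordering.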
0.17.5 by Robert Collins
nograph tests completely passing.
705
    def get_sha1s(self, keys):
706
        """See VersionedFiles.get_sha1s()."""
707
        result = {}
708
        for record in self.get_record_stream(keys, 'unordered', True):
709
            if record.sha1 is not None:
710
                result[record.key] = record.sha1
711
            else:
712
                if record.storage_kind != 'absent':
713
                    result[record.key] = sha_string(record.get_bytes_as(
714
                        'fulltext'))
715
        return result
716
0.17.2 by Robert Collins
Core proof of concept working.
717
    def insert_record_stream(self, stream):
718
        """Insert a record stream into this container.
719
720
        :param stream: A stream of records to insert. 
721
        :return: None
722
        :seealso VersionedFiles.get_record_stream:
723
        """
0.17.5 by Robert Collins
nograph tests completely passing.
724
        for _ in self._insert_record_stream(stream):
725
            pass
0.17.2 by Robert Collins
Core proof of concept working.
726
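    # Illustrative sketch: fetching from another store by piping its record
    # stream into this one ('source_vf' and 'keys' are hypothetical):
    #
    #   vf.insert_record_stream(
    #       source_vf.get_record_stream(keys, 'gc-optimal', True))
    #
    # Records are recompressed into new groups; _insert_record_stream below
    # flushes the current group and starts a fresh GroupCompressor whenever
    # its heuristics decide the group has grown large enough.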
0.17.5 by Robert Collins
nograph tests completely passing.
727
    def _insert_record_stream(self, stream, random_id=False):
0.17.2 by Robert Collins
Core proof of concept working.
728
        """Internal core to insert a record stream into this container.
729
730
        This helper function has a different interface than insert_record_stream
731
        to allow add_lines to be minimal, but still return the needed data.
732
733
        :param stream: A stream of records to insert. 
734
        :return: An iterator over the sha1 of the inserted records.
735
        :seealso insert_record_stream:
736
        :seealso add_lines:
737
        """
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
738
        adapters = {}
0.17.5 by Robert Collins
nograph tests completely passing.
739
        def get_adapter(adapter_key):
740
            try:
741
                return adapters[adapter_key]
742
            except KeyError:
743
                adapter_factory = adapter_registry.get(adapter_key)
744
                adapter = adapter_factory(self)
745
                adapters[adapter_key] = adapter
746
                return adapter
0.17.2 by Robert Collins
Core proof of concept working.
747
        # This will go up to fulltexts for gc to gc fetching, which isn't
748
        # ideal.
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
749
        self._compressor = GroupCompressor(self._delta)
750
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
751
        keys_to_add = []
752
        basis_end = 0
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
753
        def flush():
0.25.7 by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content.
754
            bytes = self._compressor._block.to_bytes(
755
                ''.join(self._compressor.lines))
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
756
            index, start, length = self._access.add_raw_records(
0.25.7 by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content.
757
                [(None, len(bytes))], bytes)[0]
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
758
            nodes = []
759
            for key, reads, refs in keys_to_add:
760
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
761
            self._index.add_records(nodes, random_id=random_id)
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
762
            self._unadded_refs = {}
763
            del keys_to_add[:]
764
            self._compressor = GroupCompressor(self._delta)
765
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
766
        last_prefix = None
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
767
        last_fulltext_len = None
768
        max_fulltext_len = 0
0.17.2 by Robert Collins
Core proof of concept working.
769
        for record in stream:
0.17.5 by Robert Collins
nograph tests completely passing.
770
            # Raise an error when a record is missing.
771
            if record.storage_kind == 'absent':
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
772
                raise errors.RevisionNotPresent(record.key, self)
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
773
            try:
0.23.52 by John Arbash Meinel
Use the max_delta flag.
774
                bytes = record.get_bytes_as('fulltext')
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
775
            except errors.UnavailableRepresentation:
0.17.5 by Robert Collins
nograph tests completely passing.
776
                adapter_key = record.storage_kind, 'fulltext'
777
                adapter = get_adapter(adapter_key)
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
778
                bytes = adapter.get_bytes(record)
0.20.13 by John Arbash Meinel
Play around a bit.
779
            if len(record.key) > 1:
780
                prefix = record.key[0]
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
781
            else:
782
                prefix = None
783
            max_fulltext_len = max(max_fulltext_len, len(bytes))
784
            (found_sha1, end_point, type,
785
             length) = self._compressor.compress(record.key,
786
                bytes, record.sha1)
787
            # Check if we want to continue to include that text
788
            start_new_block = False
789
            if end_point > 2 * max_fulltext_len:
790
                if end_point > 4*1024*1024:
791
                    start_new_block = True
792
                elif (prefix is not None and prefix != last_prefix
793
                      and end_point > 2*1024*1024):
794
                    start_new_block = True
795
            # if type == 'fulltext':
796
            #     # If this is the first text, we don't do anything
797
            #     if self._compressor.num_keys > 1:
798
            #         if prefix is not None and prefix != last_prefix:
799
            #             # We just inserted a fulltext for a different prefix
800
            #             # (aka file-id).
801
            #             if end_point > 512 * 1024:
802
            #                 start_new_block = True
803
            #             # TODO: Consider packing several small texts together
804
            #             #       maybe only flush if end_point > some threshold
805
            #             # if end_point > 512 * 1024 or len(bytes) <
806
            #             #     start_new_block = true
807
            #         else:
808
            #             # We just added a fulltext, part of the same file-id
809
            #             if (end_point > 2*1024*1024
810
            #                 and end_point > 5*max_fulltext_len):
811
            #                 start_new_block = True
812
            #     last_fulltext_len = len(bytes)
813
            # else:
814
            #     delta_ratio = float(len(bytes)) / length
815
            #     if delta_ratio < 3: # Not much compression
816
            #         if end_point > 1*1024*1024:
817
            #             start_new_block = True
818
            #     elif delta_ratio < 10: # 10:1 compression
819
            #         if end_point > 4*1024*1024:
820
            #             start_new_block = True
821
            last_prefix = prefix
822
            if start_new_block:
823
                self._compressor.pop_last()
824
                flush()
825
                basis_end = 0
826
                max_fulltext_len = len(bytes)
827
                (found_sha1, end_point, type,
828
                 length) = self._compressor.compress(record.key,
829
                    bytes, record.sha1)
830
                assert type == 'fulltext'
831
                last_fulltext_len = length
0.17.26 by Robert Collins
Working better --gc-plain-chk.
832
            if record.key[-1] is None:
                key = record.key[:-1] + ('sha1:' + found_sha1,)
            else:
                key = record.key
            self._unadded_refs[key] = record.parents
            yield found_sha1
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
        if len(keys_to_add):
            flush()
        self._compressor = None
        assert self._unadded_refs == {}

    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
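        # Illustrative use only (names here are hypothetical):
        #   for line, key in vf.iter_lines_added_or_present_in_keys(keys):
        #       seen_lines.setdefault(key, []).append(line)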
        if pb is None:
            # local import: 'progress' may not be imported at module level
            # in this plugin.
            from bzrlib import progress
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to set up a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)

    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            # local import: 'trace' may not be imported at module level
            # in this plugin.
            from bzrlib import trace
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback, returns True if the index is locked and
            thus usable.
        :param parents: If True, record knit parents; if not, do not record
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked

    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the Immutable GraphIndex
        backing the _GCGraphIndex; instead it prepares data for insertion by
        the caller, checks that it is safe to insert, and then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples:
                         (key, value, node_refs).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)

    def _check_read(self):
        """Raise an exception if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _check_write_ok(self):
        """Raise an exception if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        Note: Callers are responsible for checking that the index is locked
        before calling this method.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        result = {}
        if self._parents:
            for node in nodes:
                result[node[1]] = node[3][0]
        else:
            for node in nodes:
                result[node[1]] = None
        return result

    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys)
        for entry in entries:
            key = entry[1]
            if not self._parents:
                parents = None
            else:
                parents = entry[3][0]
            method = 'group'
            result[key] = (self._node_to_position(entry),
                                  None, parents, (method, None))
        return result

    def keys(self):
        """Get all the keys in the collection.

        The keys are not ordered.
        """
        self._check_read()
        return [node[1] for node in self._graph_index.iter_all_entries()]

    def _node_to_position(self, node):
        """Convert an index value to position details."""
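        # The stored value is a space-separated string of byte offsets; this
        # expects four fields: 'start stop basis_end delta_end'.  (Note that
        # _insert_record_stream above currently writes only two fields.)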
        bits = node[2].split(' ')
        # It would be nice not to read the entire gzip.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end


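# Import the optional compiled extension if it has been built; its absence
# is not an error.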
try:
    from bzrlib.plugins.groupcompress import _groupcompress_pyx
except ImportError:
    pass