# Copyright (C) 2008, 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Core compression logic for compressing streams of related files."""

import time
import zlib
try:
    import pylzma
except ImportError:
    pylzma = None

from bzrlib import (
    annotate,
    debug,
    errors,
    graph as _mod_graph,
    knit,
    osutils,
    pack,
    trace,
    )
from bzrlib.btree_index import BTreeBuilder
from bzrlib.lru_cache import LRUSizeCache
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    ChunkedContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )

# Minimum number of uncompressed bytes to try fetch at once when retrieving
# groupcompress blocks.
BATCH_SIZE = 2**16

_USE_LZMA = False and (pylzma is not None)

# osutils.sha_string('')
_null_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'

def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into groupcompress order.

    groupcompress is defined (currently) as reverse-topological order, grouped
    by the key prefix.

    :return: A sorted-list of keys
    """
    # groupcompress ordering is approximately reverse topological,
    # properly grouped by file-id.
    per_prefix_map = {}
    for key, value in parent_map.iteritems():
        if isinstance(key, str) or len(key) == 1:
            prefix = ''
        else:
            prefix = key[0]
        try:
            per_prefix_map[prefix][key] = value
        except KeyError:
            per_prefix_map[prefix] = {key: value}

    present_keys = []
    for prefix in sorted(per_prefix_map):
        present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
    return present_keys
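
# Illustrative sketch (assuming simple two-element (file_id, revision_id)
# keys): keys are grouped by their file-id prefix, and each group is emitted
# newest-first.
#   parent_map = {('f1', 'r1'): (),
#                 ('f1', 'r2'): (('f1', 'r1'),),
#                 ('f2', 'r1'): ()}
#   sort_gc_optimal(parent_map)
#   => [('f1', 'r2'), ('f1', 'r1'), ('f2', 'r1')]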
|

# The max zlib window size is 32kB, so if we set 'max_size' output of the
# decompressor to the requested bytes + 32kB, then we should guarantee
# num_bytes coming out.
_ZLIB_DECOMP_WINDOW = 32*1024

class GroupCompressBlock(object):
    """An object which maintains the internal structure of the compressed data.

    This tracks the meta info (start of text, length, type, etc.)
    """

    # Group Compress Block v1 Zlib
    GCB_HEADER = 'gcb1z\n'
    # Group Compress Block v1 Lzma
    GCB_LZ_HEADER = 'gcb1l\n'
    GCB_KNOWN_HEADERS = (GCB_HEADER, GCB_LZ_HEADER)

    def __init__(self):
        # map by key? or just order in file?
        self._compressor_name = None
        self._z_content = None
        self._z_content_decompressor = None
        self._z_content_length = None
        self._content_length = None
        self._content = None
        self._content_chunks = None

    def __len__(self):
        # This is the maximum number of bytes this object will reference if
        # everything is decompressed. However, if we decompress less than
        # everything... (this would cause some problems for LRUSizeCache)
        return self._content_length + self._z_content_length

    def _ensure_content(self, num_bytes=None):
        """Make sure that content has been expanded enough.

        :param num_bytes: Ensure that we have extracted at least num_bytes of
            content. If None, consume everything
        """
        # TODO: If we re-use the same content block at different times during
        #       get_record_stream(), it is possible that the first pass will
        #       get inserted, triggering an extract/_ensure_content() which
        #       will get rid of _z_content. And then the next use of the block
        #       will try to access _z_content (to send it over the wire), and
        #       fail because it is already extracted. Consider never releasing
        #       _z_content because of this.
        if num_bytes is None:
            num_bytes = self._content_length
        elif (self._content_length is not None
              and num_bytes > self._content_length):
            raise AssertionError(
                'requested num_bytes (%d) > content length (%d)'
                % (num_bytes, self._content_length))
        # Expand the content if required
        if self._content is None:
            if self._content_chunks is not None:
                self._content = ''.join(self._content_chunks)
                self._content_chunks = None
        if self._content is None:
            if self._z_content is None:
                raise AssertionError('No content to decompress')
            if self._z_content == '':
                self._content = ''
            elif self._compressor_name == 'lzma':
                # We don't do partial lzma decomp yet
                self._content = pylzma.decompress(self._z_content)
            elif self._compressor_name == 'zlib':
                # Start a zlib decompressor
                if num_bytes is None:
                    self._content = zlib.decompress(self._z_content)
                else:
                    self._z_content_decompressor = zlib.decompressobj()
                    # Seed the decompressor with the uncompressed bytes, so
                    # that the rest of the code is simplified
                    self._content = self._z_content_decompressor.decompress(
                        self._z_content, num_bytes + _ZLIB_DECOMP_WINDOW)
            else:
                raise AssertionError('Unknown compressor: %r'
                                     % self._compressor_name)
        # Any bytes remaining to be decompressed will be in the decompressors
        # 'unconsumed_tail'

        # Do we have enough bytes already?
        if num_bytes is not None and len(self._content) >= num_bytes:
            return
        if num_bytes is None and self._z_content_decompressor is None:
            # We must have already decompressed everything
            return
        # If we got this far, and don't have a decompressor, something is wrong
        if self._z_content_decompressor is None:
            raise AssertionError(
                'No decompressor to decompress %d bytes' % num_bytes)
        remaining_decomp = self._z_content_decompressor.unconsumed_tail
        if num_bytes is None:
            if remaining_decomp:
                # We don't know how much is left, but we'll decompress it all
                self._content += self._z_content_decompressor.decompress(
                    remaining_decomp)
                # Note: There's what I consider a bug in zlib.decompressobj
                #       If you pass back in the entire unconsumed_tail, only
                #       this time you don't pass a max-size, it doesn't
                #       change the unconsumed_tail back to None/''.
                #       However, we know we are done with the whole stream
                self._z_content_decompressor = None
            # XXX: Why is this the only place in this routine we set this?
            self._content_length = len(self._content)
        else:
            if not remaining_decomp:
                raise AssertionError('Nothing left to decompress')
            needed_bytes = num_bytes - len(self._content)
            # We always set max_size to 32kB over the minimum needed, so that
            # zlib will give us as much as we really want.
            # TODO: If this isn't good enough, we could make a loop here,
            #       that keeps expanding the request until we get enough
            self._content += self._z_content_decompressor.decompress(
                remaining_decomp, needed_bytes + _ZLIB_DECOMP_WINDOW)
            if len(self._content) < num_bytes:
                raise AssertionError('%d bytes wanted, only %d available'
                                     % (num_bytes, len(self._content)))
            if not self._z_content_decompressor.unconsumed_tail:
                # The stream is finished
                self._z_content_decompressor = None

    def _parse_bytes(self, bytes, pos):
        """Read the various lengths from the header.

        This also populates the various 'compressed' buffers.

        :return: The position in bytes just after the last newline
        """
        # At present, we have 2 integers for the compressed and uncompressed
        # content. In base10 (ascii) 14 bytes can represent > 1TB, so to avoid
        # checking too far, cap the search to 14 bytes.
        pos2 = bytes.index('\n', pos, pos + 14)
        self._z_content_length = int(bytes[pos:pos2])
        pos = pos2 + 1
        pos2 = bytes.index('\n', pos, pos + 14)
        self._content_length = int(bytes[pos:pos2])
        pos = pos2 + 1
        if len(bytes) != (pos + self._z_content_length):
            # XXX: Define some GCCorrupt error ?
            raise AssertionError('Invalid bytes: (%d) != %d + %d' %
                                 (len(bytes), pos, self._z_content_length))
        self._z_content = bytes[pos:]

    @classmethod
    def from_bytes(cls, bytes):
        out = cls()
        if bytes[:6] not in cls.GCB_KNOWN_HEADERS:
            raise ValueError('bytes did not start with any of %r'
                             % (cls.GCB_KNOWN_HEADERS,))
        # XXX: why not testing the whole header ?
        if bytes[4] == 'z':
            out._compressor_name = 'zlib'
        elif bytes[4] == 'l':
            out._compressor_name = 'lzma'
        else:
            raise ValueError('unknown compressor: %r' % (bytes,))
        out._parse_bytes(bytes, 6)
        return out

    def extract(self, key, start, end, sha1=None):
        """Extract the text for a specific key.

        :param key: The label used for this content
        :param sha1: TODO (should we validate only when sha1 is supplied?)
        :return: The bytes for the content
        """
        if start == end == 0:
            return ''
        self._ensure_content(end)
        # The bytes are 'f' or 'd' for the type, then a variable-length
        # base128 integer for the content size, then the actual content
        # We know that the variable-length integer won't be longer than 5
        # bytes (it takes 5 bytes to encode 2^32)
        c = self._content[start]
        if c == 'f':
            type = 'fulltext'
        else:
            if c != 'd':
                raise ValueError('Unknown content control code: %s'
                                 % (c,))
            type = 'delta'
        content_len, len_len = decode_base128_int(
            self._content[start + 1:start + 6])
        content_start = start + 1 + len_len
        if end != content_start + content_len:
            raise ValueError('end != len according to field header'
                             ' %s != %s' % (end, content_start + content_len))
        if c == 'f':
            bytes = self._content[content_start:end]
        elif c == 'd':
            bytes = apply_delta_to_source(self._content, content_start, end)
        return bytes
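
    # Illustrative sketch of the record layout extract() walks (values are
    # made up): a fulltext record for 'hello' is stored as
    # 'f' + base128(5) + 'hello' == 'f\x05hello'. With start pointing at the
    # 'f', decode_base128_int gives (5, 1), content_start = start + 2, and
    # end must equal content_start + 5 for the bytes 'hello' to be returned.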

    def set_chunked_content(self, content_chunks, length):
        """Set the content of this block to the given chunks."""
        # If we have lots of short lines, it may be more efficient to join
        # the content ahead of time. If the content is <10MiB, we don't really
        # care about the extra memory consumption, so we can just pack it and
        # be done. However, timing showed 18s => 17.9s for repacking 1k revs of
        # mysql, which is below the noise margin
        self._content_length = length
        self._content_chunks = content_chunks
        self._content = None
        self._z_content = None

    def set_content(self, content):
        """Set the content of this block."""
        self._content_length = len(content)
        self._content = content
        self._z_content = None

    def _create_z_content_using_lzma(self):
        if self._content_chunks is not None:
            self._content = ''.join(self._content_chunks)
            self._content_chunks = None
        if self._content is None:
            raise AssertionError('Nothing to compress')
        self._z_content = pylzma.compress(self._content)
        self._z_content_length = len(self._z_content)

    def _create_z_content_from_chunks(self):
        compressor = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION)
        compressed_chunks = map(compressor.compress, self._content_chunks)
        compressed_chunks.append(compressor.flush())
        self._z_content = ''.join(compressed_chunks)
        self._z_content_length = len(self._z_content)

    def _create_z_content(self):
        if self._z_content is not None:
            return
        if _USE_LZMA:
            self._create_z_content_using_lzma()
            return
        if self._content_chunks is not None:
            self._create_z_content_from_chunks()
            return
        self._z_content = zlib.compress(self._content)
        self._z_content_length = len(self._z_content)

    def to_bytes(self):
        """Encode the information into a byte stream."""
        self._create_z_content()
        if _USE_LZMA:
            header = self.GCB_LZ_HEADER
        else:
            header = self.GCB_HEADER
        chunks = [header,
                  '%d\n%d\n' % (self._z_content_length, self._content_length),
                  self._z_content,
                 ]
        return ''.join(chunks)
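
    # Illustrative round trip (hypothetical values, zlib path): set_content()
    # stores the raw text, to_bytes() emits GCB_HEADER + '<z_len>\n<len>\n'
    # followed by the zlib-compressed content, and from_bytes() on the other
    # side recreates an equivalent block:
    #   block = GroupCompressBlock()
    #   block.set_content('some text')
    #   wire = block.to_bytes()        # 'gcb1z\n' + '<z_len>\n9\n' + z_bytes
    #   copy = GroupCompressBlock.from_bytes(wire)
    #   copy._ensure_content()         # copy._content == 'some text'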

    def _dump(self, include_text=False):
        """Take this block, and spit out a human-readable structure.

        :param include_text: Inserts also include text bits, choose whether
            you want this displayed in the dump or not.
        :return: A dump of the given block. The layout is something like:
            [('f', length), ('d', delta_length, text_length, [delta_info])]
            delta_info := [('i', num_bytes, text), ('c', offset, num_bytes),
            ...]
        """
        self._ensure_content()
        result = []
        pos = 0
        while pos < self._content_length:
            kind = self._content[pos]
            pos += 1
            if kind not in ('f', 'd'):
                raise ValueError('invalid kind character: %r' % (kind,))
            content_len, len_len = decode_base128_int(
                self._content[pos:pos + 5])
            pos += len_len
            if content_len + pos > self._content_length:
                raise ValueError('invalid content_len %d for record @ pos %d'
                                 % (content_len, pos - len_len - 1))
            if kind == 'f': # Fulltext
                if include_text:
                    text = self._content[pos:pos+content_len]
                    result.append(('f', content_len, text))
                else:
                    result.append(('f', content_len))
            elif kind == 'd': # Delta
                delta_content = self._content[pos:pos+content_len]
                delta_info = []
                # The first entry in a delta is the decompressed length
                decomp_len, delta_pos = decode_base128_int(delta_content)
                result.append(('d', content_len, decomp_len, delta_info))
                measured_len = 0
                while delta_pos < content_len:
                    c = ord(delta_content[delta_pos])
                    delta_pos += 1
                    if c & 0x80: # Copy
                        (offset, length,
                         delta_pos) = decode_copy_instruction(delta_content, c,
                                                              delta_pos)
                        if include_text:
                            text = self._content[offset:offset+length]
                            delta_info.append(('c', offset, length, text))
                        else:
                            delta_info.append(('c', offset, length))
                        measured_len += length
                    else: # Insert
                        if include_text:
                            txt = delta_content[delta_pos:delta_pos+c]
                        else:
                            txt = ''
                        delta_info.append(('i', c, txt))
                        measured_len += c
                        delta_pos += c
                if delta_pos != content_len:
                    raise ValueError('Delta consumed a bad number of bytes:'
                                     ' %d != %d' % (delta_pos, content_len))
                if measured_len != decomp_len:
                    raise ValueError('Delta claimed fulltext was %d bytes, but'
                                     ' extraction resulted in %d bytes'
                                     % (decomp_len, measured_len))
            pos += content_len
        return result


class _LazyGroupCompressFactory(object):
    """Yield content from a GroupCompressBlock on demand."""

    def __init__(self, key, parents, manager, start, end, first):
        """Create a _LazyGroupCompressFactory

        :param key: The key of just this record
        :param parents: The parents of this key (possibly None)
        :param gc_block: A GroupCompressBlock object
        :param start: Offset of the first byte for this record in the
            uncompressed content
        :param end: Offset of the byte just after the end of this record
            (ie, bytes = content[start:end])
        :param first: Is this the first Factory for the given block?
        """
        self.key = key
        self.parents = parents
        self.sha1 = None
        # Note: This attribute coupled with Manager._factories creates a
        #       reference cycle. Perhaps we would rather use a weakref(), or
        #       find an appropriate time to release the ref. After the first
        #       get_bytes_as call? After Manager.get_record_stream() returns
        #       the object?
        self._manager = manager
        self._bytes = None
        self.storage_kind = 'groupcompress-block'
        if not first:
            self.storage_kind = 'groupcompress-block-ref'
        self._first = first
        self._start = start
        self._end = end

    def __repr__(self):
        return '%s(%s, first=%s)' % (self.__class__.__name__,
            self.key, self._first)

    def get_bytes_as(self, storage_kind):
        if storage_kind == self.storage_kind:
            if self._first:
                # wire bytes, something...
                return self._manager._wire_bytes()
            else:
                return ''
        if storage_kind in ('fulltext', 'chunked'):
            if self._bytes is None:
                # Grab and cache the raw bytes for this entry
                # and break the ref-cycle with _manager since we don't need it
                # anymore
                self._manager._prepare_for_extract()
                block = self._manager._block
                self._bytes = block.extract(self.key, self._start, self._end)
                # There are code paths that first extract as fulltext, and then
                # extract as storage_kind (smart fetch). So we don't break the
                # refcycle here, but instead in manager.get_record_stream()
            if storage_kind == 'fulltext':
                return self._bytes
            else:
                return [self._bytes]
        raise errors.UnavailableRepresentation(self.key, storage_kind,
            self.storage_kind)


class _LazyGroupContentManager(object):
    """This manages a group of _LazyGroupCompressFactory objects."""

    _max_cut_fraction = 0.75 # We allow a block to be trimmed to 75% of
                             # current size, and still be considered
                             # reusable
    _full_block_size = 4*1024*1024
    _full_mixed_block_size = 2*1024*1024
    _full_enough_block_size = 3*1024*1024 # size at which we won't repack
    _full_enough_mixed_block_size = 2*768*1024 # 1.5MB

    def __init__(self, block):
        self._block = block
        # We need to preserve the ordering
        self._factories = []
        self._last_byte = 0

    def add_factory(self, key, parents, start, end):
        if not self._factories:
            first = True
        else:
            first = False
        # Note that this creates a reference cycle....
        factory = _LazyGroupCompressFactory(key, parents, self,
            start, end, first=first)
        # max() works here, but as a function call, doing a compare seems to be
        # significantly faster, timeit says 250ms for max() and 100ms for the
        # comparison
        if end > self._last_byte:
            self._last_byte = end
        self._factories.append(factory)

    def get_record_stream(self):
        """Get a record for all keys added so far."""
        for factory in self._factories:
            yield factory
            # Break the ref-cycle
            factory._bytes = None
            factory._manager = None
        # TODO: Consider setting self._factories = None after the above loop,
        #       as it will break the reference cycle

    def _trim_block(self, last_byte):
        """Create a new GroupCompressBlock, with just some of the content."""
        # None of the factories need to be adjusted, because the content is
        # located in an identical place. Just that some of the unreferenced
        # trailing bytes are stripped
        trace.mutter('stripping trailing bytes from groupcompress block'
                     ' %d => %d', self._block._content_length, last_byte)
        new_block = GroupCompressBlock()
        self._block._ensure_content(last_byte)
        new_block.set_content(self._block._content[:last_byte])
        self._block = new_block

    def _rebuild_block(self):
        """Create a new GroupCompressBlock with only the referenced texts."""
        compressor = GroupCompressor()
        tstart = time.time()
        old_length = self._block._content_length
        end_point = 0
        for factory in self._factories:
            bytes = factory.get_bytes_as('fulltext')
            (found_sha1, start_point, end_point,
             type) = compressor.compress(factory.key, bytes, factory.sha1)
            # Now update this factory with the new offsets, etc
            factory.sha1 = found_sha1
            factory._start = start_point
            factory._end = end_point
        self._last_byte = end_point
        new_block = compressor.flush()
        # TODO: Should we check that new_block really *is* smaller than the old
        #       block? It seems hard to come up with a method that it would
        #       expand, since we do full compression again. Perhaps based on a
        #       request that ends up poorly ordered?
        delta = time.time() - tstart
        self._block = new_block
        trace.mutter('creating new compressed block on-the-fly in %.3fs'
                     ' %d bytes => %d bytes', delta, old_length,
                     self._block._content_length)

    def _prepare_for_extract(self):
        """A _LazyGroupCompressFactory is about to extract to fulltext."""
        # We expect that if one child is going to fulltext, all will be. This
        # helps prevent all of them from extracting a small amount at a time.
        # Which in itself isn't terribly expensive, but resizing 2MB 32kB at a
        # time (self._block._content) is a little expensive.
        self._block._ensure_content(self._last_byte)

    def _check_rebuild_action(self):
        """Check to see if our block should be repacked."""
        total_bytes_used = 0
        last_byte_used = 0
        for factory in self._factories:
            total_bytes_used += factory._end - factory._start
            if last_byte_used < factory._end:
                last_byte_used = factory._end
        # If we are using more than half of the bytes from the block, we have
        # nothing else to check
        if total_bytes_used * 2 >= self._block._content_length:
            return None, last_byte_used, total_bytes_used
        # We are using less than 50% of the content. Is the content we are
        # using at the beginning of the block? If so, we can just trim the
        # tail, rather than rebuilding from scratch.
        if total_bytes_used * 2 > last_byte_used:
            return 'trim', last_byte_used, total_bytes_used

        # We are using a small amount of the data, and it isn't just packed
        # nicely at the front, so rebuild the content.
        # Note: This would be *nicer* as a strip-data-from-group, rather than
        #       building it up again from scratch
        #       It might be reasonable to consider the fulltext sizes for
        #       different bits when deciding this, too. As you may have a small
        #       fulltext, and a trivial delta, and you are just trading around
        #       for another fulltext. If we do a simple 'prune' you may end up
        #       expanding many deltas into fulltexts, as well.
        #       If we build a cheap enough 'strip', then we could try a strip,
        #       if that expands the content, we then rebuild.
        return 'rebuild', last_byte_used, total_bytes_used
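
    # Worked example (numbers are hypothetical): for a 1000-byte block whose
    # factories reference 300 bytes in total, we are under the 50% mark. If
    # the last referenced byte is at offset 400, then 300 * 2 > 400 and the
    # block is merely trimmed to 400 bytes; if the last referenced byte were
    # at offset 900, 300 * 2 <= 900, so the block would be rebuilt instead.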

    def check_is_well_utilized(self):
        """Is the current block considered 'well utilized'?

        This heuristic asks if the current block considers itself to be a fully
        developed group, rather than just a loose collection of data.
        """
        if len(self._factories) == 1:
            # A block of length 1 could be improved by combining with other
            # groups - don't look deeper. Even larger than max size groups
            # could compress well with adjacent versions of the same thing.
            return False
        action, last_byte_used, total_bytes_used = self._check_rebuild_action()
        block_size = self._block._content_length
        if total_bytes_used < block_size * self._max_cut_fraction:
            # This block wants to trim itself small enough that we want to
            # consider it under-utilized.
            return False
        # TODO: This code is meant to be the twin of _insert_record_stream's
        #       'start_new_block' logic. It would probably be better to factor
        #       out that logic into a shared location, so that it stays
        #       together better
        # We currently assume a block is properly utilized whenever it is >75%
        # of the size of a 'full' block. In normal operation, a block is
        # considered full when it hits 4MB of same-file content. So any block
        # >3MB is 'full enough'.
        # The only time this isn't true is when a given block has large-object
        # content. (a single file >4MB, etc.)
        # Under these circumstances, we allow a block to grow to
        # 2 x largest_content. Which means that if a given block had a large
        # object, it may actually be under-utilized. However, given that this
        # is 'pack-on-the-fly' it is probably reasonable to not repack large
        # content blobs on-the-fly. Note that because we return False for all
        # 1-item blobs, we will repack them; we may wish to reevaluate our
        # treatment of large object blobs in the future.
        if block_size >= self._full_enough_block_size:
            return True
        # If a block is <3MB, it still may be considered 'full' if it contains
        # mixed content. The current rule is 2MB of mixed content is considered
        # full. So check to see if this block contains mixed content, and
        # set the threshold appropriately.
        common_prefix = None
        for factory in self._factories:
            prefix = factory.key[:-1]
            if common_prefix is None:
                common_prefix = prefix
            elif prefix != common_prefix:
                # Mixed content, check the size appropriately
                if block_size >= self._full_enough_mixed_block_size:
                    return True
                break
        # The content failed both the mixed check and the single-content check
        # so obviously it is not fully utilized
        # TODO: there is one other constraint that isn't being checked
        #       namely, that the entries in the block are in the appropriate
        #       order. For example, you could insert the entries in exactly
        #       reverse groupcompress order, and we would think that is ok.
        #       (all the right objects are in one group, and it is fully
        #       utilized, etc.) For now, we assume that case is rare,
        #       especially since we should always fetch in 'groupcompress'
        #       order.
        return False

    def _check_rebuild_block(self):
        action, last_byte_used, total_bytes_used = self._check_rebuild_action()
        if action is None:
            return
        if action == 'trim':
            self._trim_block(last_byte_used)
        elif action == 'rebuild':
            self._rebuild_block()
        else:
            raise ValueError('unknown rebuild action: %r' % (action,))

    def _wire_bytes(self):
        """Return a byte stream suitable for transmitting over the wire."""
        self._check_rebuild_block()
        # The outer block starts with:
        #   'groupcompress-block\n'
        #   <length of compressed key info>\n
        #   <length of uncompressed info>\n
        #   <length of gc block>\n
        #   <header bytes>
        #   <gc-block>
        lines = ['groupcompress-block\n']
        # The minimal info we need is the key, the start offset, and the
        # parents. The length and type are encoded in the record itself.
        # However, passing in the other bits makes it easier. The list of
        # keys, and the start offset, the length
        # 1 line key
        # 1 line with parents, '' for ()
        # 1 line for start offset
        # 1 line for end byte
        header_lines = []
        for factory in self._factories:
            key_bytes = '\x00'.join(factory.key)
            parents = factory.parents
            if parents is None:
                parent_bytes = 'None:'
            else:
                parent_bytes = '\t'.join('\x00'.join(key) for key in parents)
            record_header = '%s\n%s\n%d\n%d\n' % (
                key_bytes, parent_bytes, factory._start, factory._end)
            header_lines.append(record_header)
            # TODO: Can we break the refcycle at this point and set
            #       factory._manager = None?
        header_bytes = ''.join(header_lines)
        del header_lines
        header_bytes_len = len(header_bytes)
        z_header_bytes = zlib.compress(header_bytes)
        del header_bytes
        z_header_bytes_len = len(z_header_bytes)
        block_bytes = self._block.to_bytes()
        lines.append('%d\n%d\n%d\n' % (z_header_bytes_len, header_bytes_len,
                                       len(block_bytes)))
        lines.append(z_header_bytes)
        lines.append(block_bytes)
        del z_header_bytes, block_bytes
        return ''.join(lines)

    @classmethod
    def from_bytes(cls, bytes):
        # TODO: This does extra string copying, probably better to do it a
        #       different way
        (storage_kind, z_header_len, header_len,
         block_len, rest) = bytes.split('\n', 4)
        del bytes
        if storage_kind != 'groupcompress-block':
            raise ValueError('Unknown storage kind: %s' % (storage_kind,))
        z_header_len = int(z_header_len)
        if len(rest) < z_header_len:
            raise ValueError('Compressed header len shorter than all bytes')
        z_header = rest[:z_header_len]
        header_len = int(header_len)
        header = zlib.decompress(z_header)
        if len(header) != header_len:
            raise ValueError('invalid length for decompressed bytes')
        del z_header
        block_len = int(block_len)
        if len(rest) != z_header_len + block_len:
            raise ValueError('Invalid length for block')
        block_bytes = rest[z_header_len:]
        del rest
        # So now we have a valid GCB, we just need to parse the factories that
        # were sent to us
        header_lines = header.split('\n')
        del header
        last = header_lines.pop()
        if last != '':
            raise ValueError('header lines did not end with a trailing'
                             ' newline')
        if len(header_lines) % 4 != 0:
            raise ValueError('The header was not an even multiple of 4 lines')
        block = GroupCompressBlock.from_bytes(block_bytes)
        del block_bytes
        result = cls(block)
        for start in xrange(0, len(header_lines), 4):
            # intern()?
            key = tuple(header_lines[start].split('\x00'))
            parents_line = header_lines[start+1]
            if parents_line == 'None:':
                parents = None
            else:
                parents = tuple([tuple(segment.split('\x00'))
                                 for segment in parents_line.split('\t')
                                 if segment])
            start_offset = int(header_lines[start+2])
            end_offset = int(header_lines[start+3])
            result.add_factory(key, parents, start_offset, end_offset)
        return result


def network_block_to_records(storage_kind, bytes, line_end):
    if storage_kind != 'groupcompress-block':
        raise ValueError('Unknown storage kind: %s' % (storage_kind,))
    manager = _LazyGroupContentManager.from_bytes(bytes)
    return manager.get_record_stream()
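
# Illustrative flow (local variable names are made up): the sending side
# serializes its factories plus the block, and the receiving side turns the
# same bytes back into record factories:
#   wire = manager._wire_bytes()
#   records = network_block_to_records('groupcompress-block', wire, None)
#   for record in records:
#       text = record.get_bytes_as('fulltext')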
|

class _CommonGroupCompressor(object):

    def __init__(self):
        """Create a GroupCompressor."""
        self.chunks = []
        self._last = None
        self.endpoint = 0
        self.input_bytes = 0
        self.labels_deltas = {}
        self._delta_index = None # Set by the children
        self._block = GroupCompressBlock()

    def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is 'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param bytes: The bytes to be compressed
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :param nostore_sha: If the computed sha1 sum matches, we will raise
            ExistingContent rather than adding the text.
        :param soft: Do a 'soft' compression. This means that we require larger
            ranges to match to be considered for a copy command.

        :return: The sha1 of lines, the start and end offsets in the delta, and
            the type ('fulltext' or 'delta').

        :seealso VersionedFiles.add_lines:
        """
        if not bytes: # empty, like a dir entry, etc
            if nostore_sha == _null_sha1:
                raise errors.ExistingContent()
            return _null_sha1, 0, 0, 'fulltext'
        # we assume someone knew what they were doing when they passed it in
        if expected_sha is not None:
            sha1 = expected_sha
        else:
            sha1 = osutils.sha_string(bytes)
        if nostore_sha is not None:
            if sha1 == nostore_sha:
                raise errors.ExistingContent()
        if key[-1] is None:
            key = key[:-1] + ('sha1:' + sha1,)

        start, end, type = self._compress(key, bytes, len(bytes) / 2, soft)
        return sha1, start, end, type
|
815 |
||
816 |
def _compress(self, key, bytes, max_delta_size, soft=False): |
|
817 |
"""Compress lines with label key.
|
|
818 |
||
819 |
:param key: A key tuple. It is stored in the output for identification
|
|
820 |
of the text during decompression.
|
|
821 |
||
822 |
:param bytes: The bytes to be compressed
|
|
823 |
||
824 |
:param max_delta_size: The size above which we issue a fulltext instead
|
|
825 |
of a delta.
|
|
826 |
||
827 |
:param soft: Do a 'soft' compression. This means that we require larger
|
|
828 |
ranges to match to be considered for a copy command.
|
|
829 |
||
830 |
:return: The sha1 of lines, the start and end offsets in the delta, and
|
|
831 |
the type ('fulltext' or 'delta').
|
|
832 |
"""
|
|
833 |
raise NotImplementedError(self._compress) |
|
834 |
||
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
835 |
def extract(self, key): |
836 |
"""Extract a key previously added to the compressor.
|
|
837 |
||
838 |
:param key: The key to extract.
|
|
839 |
:return: An iterable over bytes and the sha1.
|
|
840 |
"""
|
|
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
841 |
(start_byte, start_chunk, end_byte, end_chunk) = self.labels_deltas[key] |
842 |
delta_chunks = self.chunks[start_chunk:end_chunk] |
|
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
843 |
stored_bytes = ''.join(delta_chunks) |
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
844 |
if stored_bytes[0] == 'f': |
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
845 |
fulltext_len, offset = decode_base128_int(stored_bytes[1:10]) |
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
846 |
data_len = fulltext_len + 1 + offset |
847 |
if data_len != len(stored_bytes): |
|
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
848 |
raise ValueError('Index claimed fulltext len, but stored bytes' |
849 |
' claim %s != %s' |
|
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
850 |
% (len(stored_bytes), data_len)) |
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
851 |
bytes = stored_bytes[offset + 1:] |
852 |
else: |
|
853 |
# XXX: This is inefficient at best
|
|
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
854 |
source = ''.join(self.chunks[:start_chunk]) |
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
855 |
if stored_bytes[0] != 'd': |
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
856 |
raise ValueError('Unknown content kind, bytes claim %s' |
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
857 |
% (stored_bytes[0],)) |
858 |
delta_len, offset = decode_base128_int(stored_bytes[1:10]) |
|
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
859 |
data_len = delta_len + 1 + offset |
860 |
if data_len != len(stored_bytes): |
|
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
861 |
raise ValueError('Index claimed delta len, but stored bytes' |
862 |
' claim %s != %s' |
|
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
863 |
% (len(stored_bytes), data_len)) |
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
864 |
bytes = apply_delta(source, stored_bytes[offset + 1:]) |
865 |
bytes_sha1 = osutils.sha_string(bytes) |
|
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
866 |
return bytes, bytes_sha1 |
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
867 |
|
3735.40.17
by John Arbash Meinel
Change the attribute from 'lines' to 'chunks' to make it more |
868 |
def flush(self): |
869 |
"""Finish this group, creating a formatted stream.
|
|
870 |
||
871 |
After calling this, the compressor should no longer be used
|
|
872 |
"""
|
|
4398.6.2
by John Arbash Meinel
Add a TODO, marking the code that causes us to peak at 2x memory consumption |
873 |
# TODO: this causes us to 'bloat' to 2x the size of content in the
|
874 |
# group. This has an impact for 'commit' of large objects.
|
|
875 |
# One possibility is to use self._content_chunks, and be lazy and
|
|
876 |
# only fill out self._content as a full string when we actually
|
|
877 |
# need it. That would at least drop the peak memory consumption
|
|
878 |
# for 'commit' down to ~1x the size of the largest file, at a
|
|
879 |
# cost of increased complexity within this code. 2x is still <<
|
|
880 |
# 3x the size of the largest file, so we are doing ok.
|
|
4469.1.2
by John Arbash Meinel
The only caller already knows the content length, so make the api such that |
881 |
self._block.set_chunked_content(self.chunks, self.endpoint) |
3735.40.17
by John Arbash Meinel
Change the attribute from 'lines' to 'chunks' to make it more |
882 |
self.chunks = None |
883 |
self._delta_index = None |
|
884 |
return self._block |
|
885 |
||
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
886 |
def pop_last(self): |
887 |
"""Call this if you want to 'revoke' the last compression.
|
|
888 |
||
889 |
After this, the data structures will be rolled back, but you cannot do
|
|
890 |
more compression.
|
|
891 |
"""
|
|
892 |
self._delta_index = None |
|
3735.40.17
by John Arbash Meinel
Change the attribute from 'lines' to 'chunks' to make it more |
893 |
del self.chunks[self._last[0]:] |
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
894 |
self.endpoint = self._last[1] |
895 |
self._last = None |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
896 |
|
897 |
def ratio(self): |
|
898 |
"""Return the overall compression ratio."""
|
|
899 |
return float(self.input_bytes) / float(self.endpoint) |
|
900 |
||
901 |
||
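# A hedged end-to-end sketch of the compressor API defined above.  The keys
# and texts are invented, and GroupCompressor is the module-level name (used
# later in this file) for whichever concrete compressor implementation is in
# use.
def _example_compress_texts(texts_by_key):
    compressor = GroupCompressor()
    offsets = {}
    for key, text in sorted(texts_by_key.items()):
        # compress() returns the sha1 plus the (start, end) offsets and the
        # record type ('fulltext' or 'delta') within the group.
        sha1, start, end, kind = compressor.compress(key, text, None)
        offsets[key] = (start, end, kind)
    ratio = compressor.ratio()
    # flush() finalises the group; the compressor must not be reused after.
    block_bytes = compressor.flush().to_bytes()
    return offsets, ratio, block_bytes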
902 |
class PythonGroupCompressor(_CommonGroupCompressor): |
|
903 |
||
3735.40.2
by John Arbash Meinel
Add a groupcompress.encode_copy_instruction function. |
904 |
def __init__(self): |
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
905 |
"""Create a GroupCompressor.
|
906 |
||
907 |
Used only if the pyrex version is not available.
|
|
908 |
"""
|
|
909 |
super(PythonGroupCompressor, self).__init__() |
|
3735.40.17
by John Arbash Meinel
Change the attribute from 'lines' to 'chunks' to make it more |
910 |
self._delta_index = LinesDeltaIndex([]) |
911 |
# The actual content is managed by LinesDeltaIndex
|
|
912 |
self.chunks = self._delta_index.lines |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
913 |
|
914 |
def _compress(self, key, bytes, max_delta_size, soft=False): |
|
915 |
"""see _CommonGroupCompressor._compress"""
|
|
916 |
input_len = len(bytes) |
|
3735.40.2
by John Arbash Meinel
Add a groupcompress.encode_copy_instruction function. |
917 |
new_lines = osutils.split_lines(bytes) |
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
918 |
out_lines, index_lines = self._delta_index.make_delta( |
919 |
new_lines, bytes_length=input_len, soft=soft) |
|
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
920 |
delta_length = sum(map(len, out_lines)) |
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
921 |
if delta_length > max_delta_size: |
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
922 |
# The delta is longer than max_delta_size, so insert a fulltext instead
|
923 |
type = 'fulltext' |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
924 |
out_lines = ['f', encode_base128_int(input_len)] |
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
925 |
out_lines.extend(new_lines) |
926 |
index_lines = [False, False] |
|
927 |
index_lines.extend([True] * len(new_lines)) |
|
928 |
else: |
|
929 |
# this is a worthy delta, output it
|
|
930 |
type = 'delta' |
|
931 |
out_lines[0] = 'd' |
|
932 |
# Update the delta_length to include those two encoded integers
|
|
933 |
out_lines[1] = encode_base128_int(delta_length) |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
934 |
# Before insertion
|
935 |
start = self.endpoint |
|
936 |
chunk_start = len(self.chunks) |
|
4241.17.2
by John Arbash Meinel
PythonGroupCompressor needs to support pop_last() properly. |
937 |
self._last = (chunk_start, self.endpoint) |
3735.40.17
by John Arbash Meinel
Change the attribute from 'lines' to 'chunks' to make it more |
938 |
self._delta_index.extend_lines(out_lines, index_lines) |
939 |
self.endpoint = self._delta_index.endpoint |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
940 |
self.input_bytes += input_len |
941 |
chunk_end = len(self.chunks) |
|
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
942 |
self.labels_deltas[key] = (start, chunk_start, |
943 |
self.endpoint, chunk_end) |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
944 |
return start, self.endpoint, type |
945 |
||
946 |
||
947 |
class PyrexGroupCompressor(_CommonGroupCompressor): |
|
0.17.3
by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression. |
948 |
"""Produce a serialised group of compressed texts.
|
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
949 |
|
0.17.3
by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression. |
950 |
It contains code very similar to SequenceMatcher because it has a similar
|
951 |
task. However some key differences apply:
|
|
952 |
- there is no junk, we want a minimal edit not a human readable diff.
|
|
953 |
- we don't filter very common lines (because we don't know where a good
|
|
954 |
range will start, and after the first text we want to be emitting minimal
|
|
955 |
edits only).
|
|
956 |
- we chain the left side, not the right side
|
|
957 |
- we incrementally update the adjacency matrix as new lines are provided.
|
|
958 |
- we look for matches in all of the left side, so the routine which does
|
|
959 |
the analogous task of find_longest_match does not need to filter on the
|
|
960 |
left side.
|
|
961 |
"""
|
|
0.17.2
by Robert Collins
Core proof of concept working. |
962 |
|
3735.32.19
by John Arbash Meinel
Get rid of the 'delta' flag to GroupCompressor. It didn't do anything anyway. |
963 |
def __init__(self): |
3735.40.4
by John Arbash Meinel
Factor out tests that rely on the exact bytecode. |
964 |
super(PyrexGroupCompressor, self).__init__() |
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
965 |
self._delta_index = DeltaIndex() |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
966 |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
967 |
def _compress(self, key, bytes, max_delta_size, soft=False): |
968 |
"""see _CommonGroupCompressor._compress"""
|
|
0.23.52
by John Arbash Meinel
Use the max_delta flag. |
969 |
input_len = len(bytes) |
0.23.12
by John Arbash Meinel
Add a 'len:' field to the data. |
970 |
# By having action/label/sha1/len, we can parse the group if the index
|
971 |
# was ever destroyed: we have the key in 'label', we know the final
|
|
972 |
# bytes are valid from sha1, and we know where to find the end of this
|
|
973 |
# record because of 'len'. (the delta record itself will store the
|
|
974 |
# total length for the expanded record)
|
|
0.23.13
by John Arbash Meinel
Factor out the ability to have/not have labels. |
975 |
# 'len: %d\n' costs approximately 1% increase in total data
|
976 |
# Having the labels at all costs us 9-10% increase, 38% increase for
|
|
977 |
# inventory pages, and 5.8% increase for text pages
|
|
0.25.6
by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header |
978 |
# new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
|
0.23.33
by John Arbash Meinel
Fix a bug when handling multiple large-range copies. |
979 |
if self._delta_index._source_offset != self.endpoint: |
980 |
raise AssertionError('_source_offset != endpoint' |
|
981 |
'; somehow the DeltaIndex got out of sync with'
|
|
982 |
' the output lines') |
|
0.23.52
by John Arbash Meinel
Use the max_delta flag. |
983 |
delta = self._delta_index.make_delta(bytes, max_delta_size) |
984 |
if delta is None: |
|
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
985 |
type = 'fulltext' |
0.17.36
by John Arbash Meinel
Adding a mini-len to the delta/fulltext bytes |
986 |
enc_length = encode_base128_int(len(bytes)) |
987 |
len_mini_header = 1 + len(enc_length) |
|
988 |
self._delta_index.add_source(bytes, len_mini_header) |
|
989 |
new_chunks = ['f', enc_length, bytes] |
|
0.23.9
by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor. |
990 |
else: |
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
991 |
type = 'delta' |
0.17.36
by John Arbash Meinel
Adding a mini-len to the delta/fulltext bytes |
992 |
enc_length = encode_base128_int(len(delta)) |
993 |
len_mini_header = 1 + len(enc_length) |
|
994 |
new_chunks = ['d', enc_length, delta] |
|
3735.38.5
by John Arbash Meinel
A bit of testing showed that _FAST=True was actually *slower*. |
995 |
self._delta_index.add_delta_source(delta, len_mini_header) |
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
996 |
# Before insertion
|
997 |
start = self.endpoint |
|
998 |
chunk_start = len(self.chunks) |
|
999 |
# Now output these bytes
|
|
3735.40.17
by John Arbash Meinel
Change the attribute from 'lines' to 'chunks' to make it more |
1000 |
self._output_chunks(new_chunks) |
0.23.6
by John Arbash Meinel
Start stripping out the actual GroupCompressor |
1001 |
self.input_bytes += input_len |
3735.40.18
by John Arbash Meinel
Get rid of the entries dict in GroupCompressBlock. |
1002 |
chunk_end = len(self.chunks) |
1003 |
self.labels_deltas[key] = (start, chunk_start, |
|
1004 |
self.endpoint, chunk_end) |
|
0.23.29
by John Arbash Meinel
Forgot to add the delta bytes to the index objects. |
1005 |
if self._delta_index._source_offset != self.endpoint: |
1006 |
raise AssertionError('the delta index is out of sync' |
|
1007 |
' with the output lines %s != %s' |
|
1008 |
% (self._delta_index._source_offset, self.endpoint)) |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
1009 |
return start, self.endpoint, type |
0.17.2
by Robert Collins
Core proof of concept working. |
1010 |
|
3735.40.17
by John Arbash Meinel
Change the attribute from 'lines' to 'chunks' to make it more |
1011 |
def _output_chunks(self, new_chunks): |
0.23.9
by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor. |
1012 |
"""Output some chunks.
|
1013 |
||
1014 |
:param new_chunks: The chunks to output.
|
|
1015 |
"""
|
|
3735.40.17
by John Arbash Meinel
Change the attribute from 'lines' to 'chunks' to make it more |
1016 |
self._last = (len(self.chunks), self.endpoint) |
0.17.12
by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead. |
1017 |
endpoint = self.endpoint |
3735.40.17
by John Arbash Meinel
Change the attribute from 'lines' to 'chunks' to make it more |
1018 |
self.chunks.extend(new_chunks) |
0.23.9
by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor. |
1019 |
endpoint += sum(map(len, new_chunks)) |
0.17.12
by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead. |
1020 |
self.endpoint = endpoint |
0.17.3
by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression. |
1021 |
|
0.17.11
by Robert Collins
Add extraction of just-compressed texts to support converting from knits. |
1022 |
|
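# An illustrative sketch (invented keys and texts) of extract() and pop_last()
# on an in-progress compressor: texts already added can be pulled back out and
# checked against their sha1, and the most recent compression can be revoked.
def _example_extract_and_revoke():
    compressor = GroupCompressor()
    key_one = ('file-id', 'rev-1')
    compressor.compress(key_one, 'some content\n', None)
    compressor.compress(('file-id', 'rev-2'), 'some more content\n', None)
    # extract() returns the expanded bytes and their sha1.
    text, sha1 = compressor.extract(key_one)
    if sha1 != osutils.sha_string(text):
        raise AssertionError('extract() returned inconsistent data')
    # Revoke the second text; no further compression is allowed afterwards.
    compressor.pop_last()
    return text, sha1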
4465.2.4
by Aaron Bentley
Switch between warn and raise depending on inconsistent_fatal. |
1023 |
def make_pack_factory(graph, delta, keylength, inconsistency_fatal=True): |
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1024 |
"""Create a factory for creating a pack based groupcompress.
|
1025 |
||
1026 |
This is only functional enough to run interface tests; it doesn't try to
|
|
1027 |
provide a full pack environment.
|
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
1028 |
|
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1029 |
:param graph: Store a graph.
|
1030 |
:param delta: Delta compress contents.
|
|
1031 |
:param keylength: How long keys should be.
|
|
1032 |
"""
|
|
1033 |
def factory(transport): |
|
3735.32.2
by John Arbash Meinel
The 'delta' flag has no effect on the content (all GC is delta'd), |
1034 |
parents = graph |
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1035 |
ref_length = 0 |
1036 |
if graph: |
|
0.20.29
by Ian Clatworthy
groupcompress.py code cleanups |
1037 |
ref_length = 1 |
0.17.7
by Robert Collins
Update for current index2 changes. |
1038 |
graph_index = BTreeBuilder(reference_lists=ref_length, |
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1039 |
key_elements=keylength) |
1040 |
stream = transport.open_write_stream('newpack') |
|
1041 |
writer = pack.ContainerWriter(stream.write) |
|
1042 |
writer.begin() |
|
1043 |
index = _GCGraphIndex(graph_index, lambda:True, parents=parents, |
|
4465.2.4
by Aaron Bentley
Switch between warn and raise depending on inconsistent_fatal. |
1044 |
add_callback=graph_index.add_nodes, |
1045 |
inconsistency_fatal=inconsistency_fatal) |
|
4343.3.21
by John Arbash Meinel
Implement get_missing_parents in terms of _KeyRefs. |
1046 |
access = knit._DirectPackAccess({}) |
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1047 |
access.set_writer(writer, graph_index, (transport, 'newpack')) |
0.17.2
by Robert Collins
Core proof of concept working. |
1048 |
result = GroupCompressVersionedFiles(index, access, delta) |
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1049 |
result.stream = stream |
1050 |
result.writer = writer |
|
1051 |
return result |
|
1052 |
return factory |
|
1053 |
||
1054 |
||
1055 |
def cleanup_pack_group(versioned_files): |
|
0.17.23
by Robert Collins
Only decompress as much of the zlib data as is needed to read the text recipe. |
1056 |
versioned_files.writer.end() |
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1057 |
versioned_files.stream.close() |
1058 |
||
1059 |
||
4634.8.1
by Andrew Bennetts
Cherry-pick fix for bug #402657 from gc-batching. |
1060 |
class _BatchingBlockFetcher(object): |
1061 |
"""Fetch group compress blocks in batches.
|
|
1062 |
|
|
1063 |
:ivar total_bytes: the expected number of bytes (an int) needed to fetch the
|
|
1064 |
currently pending batch.
|
|
1065 |
"""
|
|
1066 |
||
1067 |
def __init__(self, gcvf, locations): |
|
1068 |
self.gcvf = gcvf |
|
1069 |
self.locations = locations |
|
1070 |
self.keys = [] |
|
1071 |
self.batch_memos = {} |
|
1072 |
self.memos_to_get = [] |
|
1073 |
self.total_bytes = 0 |
|
1074 |
self.last_read_memo = None |
|
1075 |
self.manager = None |
|
1076 |
||
1077 |
def add_key(self, key): |
|
1078 |
"""Add another to key to fetch.
|
|
1079 |
|
|
1080 |
:return: The estimated number of bytes needed to fetch the batch so
|
|
1081 |
far.
|
|
1082 |
"""
|
|
1083 |
self.keys.append(key) |
|
1084 |
index_memo, _, _, _ = self.locations[key] |
|
1085 |
read_memo = index_memo[0:3] |
|
1086 |
# Three possibilities for this read_memo:
|
|
1087 |
# - it's already part of this batch; or
|
|
1088 |
# - it's not yet part of this batch, but is already cached; or
|
|
1089 |
# - it's not yet part of this batch and will need to be fetched.
|
|
1090 |
if read_memo in self.batch_memos: |
|
1091 |
# This read memo is already in this batch.
|
|
1092 |
return self.total_bytes |
|
1093 |
try: |
|
1094 |
cached_block = self.gcvf._group_cache[read_memo] |
|
1095 |
except KeyError: |
|
1096 |
# This read memo is new to this batch, and the data isn't cached
|
|
1097 |
# either.
|
|
1098 |
self.batch_memos[read_memo] = None |
|
1099 |
self.memos_to_get.append(read_memo) |
|
1100 |
byte_length = read_memo[2] |
|
1101 |
self.total_bytes += byte_length |
|
1102 |
else: |
|
1103 |
# This read memo is new to this batch, but cached.
|
|
1104 |
# Keep a reference to the cached block in batch_memos because it's
|
|
1105 |
# certain that we'll use it when this batch is processed, but
|
|
1106 |
# there's a risk that it would fall out of _group_cache between now
|
|
1107 |
# and then.
|
|
1108 |
self.batch_memos[read_memo] = cached_block |
|
1109 |
return self.total_bytes |
|
1110 |
||
1111 |
def _flush_manager(self): |
|
1112 |
if self.manager is not None: |
|
1113 |
for factory in self.manager.get_record_stream(): |
|
1114 |
yield factory |
|
1115 |
self.manager = None |
|
1116 |
self.last_read_memo = None |
|
1117 |
||
1118 |
def yield_factories(self, full_flush=False): |
|
1119 |
"""Yield factories for keys added since the last yield. They will be
|
|
1120 |
returned in the order they were added via add_key.
|
|
1121 |
|
|
1122 |
:param full_flush: by default, some results may not be returned in case
|
|
1123 |
they can be part of the next batch. If full_flush is True, then
|
|
1124 |
all results are returned.
|
|
1125 |
"""
|
|
1126 |
if self.manager is None and not self.keys: |
|
1127 |
return
|
|
1128 |
# Fetch all memos in this batch.
|
|
1129 |
blocks = self.gcvf._get_blocks(self.memos_to_get) |
|
1130 |
# Turn blocks into factories and yield them.
|
|
1131 |
memos_to_get_stack = list(self.memos_to_get) |
|
1132 |
memos_to_get_stack.reverse() |
|
1133 |
for key in self.keys: |
|
1134 |
index_memo, _, parents, _ = self.locations[key] |
|
1135 |
read_memo = index_memo[:3] |
|
1136 |
if self.last_read_memo != read_memo: |
|
1137 |
# We are starting a new block. If we have a
|
|
1138 |
# manager, we have found everything that fits for
|
|
1139 |
# now, so yield records
|
|
1140 |
for factory in self._flush_manager(): |
|
1141 |
yield factory |
|
1142 |
# Now start a new manager.
|
|
1143 |
if memos_to_get_stack and memos_to_get_stack[-1] == read_memo: |
|
1144 |
# The next block from _get_blocks will be the block we
|
|
1145 |
# need.
|
|
1146 |
block_read_memo, block = blocks.next() |
|
1147 |
if block_read_memo != read_memo: |
|
1148 |
raise AssertionError( |
|
1149 |
"block_read_memo out of sync with read_memo"
|
|
1150 |
"(%r != %r)" % (block_read_memo, read_memo)) |
|
1151 |
self.batch_memos[read_memo] = block |
|
1152 |
memos_to_get_stack.pop() |
|
1153 |
else: |
|
1154 |
block = self.batch_memos[read_memo] |
|
1155 |
self.manager = _LazyGroupContentManager(block) |
|
1156 |
self.last_read_memo = read_memo |
|
1157 |
start, end = index_memo[3:5] |
|
1158 |
self.manager.add_factory(key, parents, start, end) |
|
1159 |
if full_flush: |
|
1160 |
for factory in self._flush_manager(): |
|
1161 |
yield factory |
|
1162 |
del self.keys[:] |
|
1163 |
self.batch_memos.clear() |
|
1164 |
del self.memos_to_get[:] |
|
1165 |
self.total_bytes = 0 |
|
1166 |
||
1167 |
||
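# A sketch of the batching pattern this class is built for; the same shape
# appears later in _get_remaining_record_stream().  BATCH_SIZE is the
# module-level byte threshold referenced there, and gcvf and keys are assumed
# to be a GroupCompressVersionedFiles instance and keys already stored in it.
def _example_batched_fetch(gcvf, keys):
    locations = gcvf._index.get_build_details(keys)
    batcher = _BatchingBlockFetcher(gcvf, locations)
    for key in keys:
        # Keep accumulating keys until the estimated bytes for the pending
        # batch grow beyond BATCH_SIZE, then emit what we have so far.
        if batcher.add_key(key) > BATCH_SIZE:
            for factory in batcher.yield_factories():
                yield factory
    # Flush whatever is left once all keys have been added.
    for factory in batcher.yield_factories(full_flush=True):
        yield factory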
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1168 |
class GroupCompressVersionedFiles(VersionedFiles): |
1169 |
"""A group-compress based VersionedFiles implementation."""
|
|
1170 |
||
4634.35.10
by Andrew Bennetts
Move tests to per_repository_chk. |
1171 |
def __init__(self, index, access, delta=True, _unadded_refs=None): |
0.17.1
by Robert Collins
Starting point. Interface tests hooked up and failing. |
1172 |
"""Create a GroupCompressVersionedFiles object.
|
1173 |
||
1174 |
:param index: The index object storing access and graph data.
|
|
1175 |
:param access: The access object storing raw data.
|
|
0.17.2
by Robert Collins
Core proof of concept working. |
1176 |
:param delta: Whether to delta compress or just entropy compress.
|
4634.35.10
by Andrew Bennetts
Move tests to per_repository_chk. |
1177 |
:param _unadded_refs: private parameter, don't use.
|
0.17.2
by Robert Collins
Core proof of concept working. |
1178 |
"""
|
1179 |
self._index = index |
|
1180 |
self._access = access |
|
1181 |
self._delta = delta |
|
4634.35.10
by Andrew Bennetts
Move tests to per_repository_chk. |
1182 |
if _unadded_refs is None: |
1183 |
_unadded_refs = {} |
|
1184 |
self._unadded_refs = _unadded_refs |
|
0.17.24
by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group. |
1185 |
self._group_cache = LRUSizeCache(max_size=50*1024*1024) |
3735.31.7
by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos. |
1186 |
self._fallback_vfs = [] |
0.17.2
by Robert Collins
Core proof of concept working. |
1187 |
|
4634.35.1
by Andrew Bennetts
Check for all necessary chk nodes, not just roots. |
1188 |
def without_fallbacks(self): |
4634.35.10
by Andrew Bennetts
Move tests to per_repository_chk. |
1189 |
"""Return a clone of this object without any fallbacks configured."""
|
1190 |
return GroupCompressVersionedFiles(self._index, self._access, |
|
1191 |
self._delta, _unadded_refs=dict(self._unadded_refs)) |
|
4634.35.1
by Andrew Bennetts
Check for all necessary chk nodes, not just roots. |
1192 |
|
0.17.2
by Robert Collins
Core proof of concept working. |
1193 |
def add_lines(self, key, parents, lines, parent_texts=None, |
1194 |
left_matching_blocks=None, nostore_sha=None, random_id=False, |
|
1195 |
check_content=True): |
|
1196 |
"""Add a text to the store.
|
|
1197 |
||
1198 |
:param key: The key tuple of the text to add.
|
|
1199 |
:param parents: The parents key tuples of the text to add.
|
|
1200 |
:param lines: A list of lines. Each line must be a bytestring. And all
|
|
1201 |
of them except the last must be terminated with \n and contain no
|
|
1202 |
other \n's. The last line may either contain no \n's or a single
|
|
1203 |
terminating \n. If the lines list does not meet this constraint, the add
|
|
1204 |
routine may error or may succeed - but you will be unable to read
|
|
1205 |
the data back accurately. (Checking the lines have been split
|
|
1206 |
correctly is expensive and extremely unlikely to catch bugs so it
|
|
1207 |
is not done at runtime unless check_content is True.)
|
|
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
1208 |
:param parent_texts: An optional dictionary containing the opaque
|
0.17.2
by Robert Collins
Core proof of concept working. |
1209 |
representations of some or all of the parents of version_id to
|
1210 |
allow delta optimisations. VERY IMPORTANT: the texts must be those
|
|
1211 |
returned by add_lines or data corruption can be caused.
|
|
1212 |
:param left_matching_blocks: a hint about which areas are common
|
|
1213 |
between the text and its left-hand-parent. The format is
|
|
1214 |
the SequenceMatcher.get_matching_blocks format.
|
|
1215 |
:param nostore_sha: Raise ExistingContent and do not add the lines to
|
|
1216 |
the versioned file if the digest of the lines matches this.
|
|
1217 |
:param random_id: If True a random id has been selected rather than
|
|
1218 |
an id determined by some deterministic process such as a converter
|
|
1219 |
from a foreign VCS. When True the backend may choose not to check
|
|
1220 |
for uniqueness of the resulting key within the versioned file, so
|
|
1221 |
this should only be done when the result is expected to be unique
|
|
1222 |
anyway.
|
|
1223 |
:param check_content: If True, the lines supplied are verified to be
|
|
1224 |
bytestrings that are correctly formed lines.
|
|
1225 |
:return: The text sha1, the number of bytes in the text, and an opaque
|
|
1226 |
representation of the inserted version which can be provided
|
|
1227 |
back to future add_lines calls in the parent_texts dictionary.
|
|
1228 |
"""
|
|
1229 |
self._index._check_write_ok() |
|
1230 |
self._check_add(key, lines, random_id, check_content) |
|
1231 |
if parents is None: |
|
1232 |
# The caller might pass None if there is no graph data, but kndx
|
|
1233 |
# indexes can't directly store that, so we give them
|
|
1234 |
# an empty tuple instead.
|
|
1235 |
parents = () |
|
1236 |
# double handling for now. Make it work until then.
|
|
0.20.5
by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks. |
1237 |
length = sum(map(len, lines)) |
1238 |
record = ChunkedContentFactory(key, parents, None, lines) |
|
3735.31.12
by John Arbash Meinel
Push nostore_sha down through the stack. |
1239 |
sha1 = list(self._insert_record_stream([record], random_id=random_id, |
1240 |
nostore_sha=nostore_sha))[0] |
|
0.20.5
by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks. |
1241 |
return sha1, length, None |
0.17.2
by Robert Collins
Core proof of concept working. |
1242 |
|
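# A hedged usage sketch for add_lines() (keys and texts invented):
#
#   sha1, num_bytes, _ = vf.add_lines(('file-id', 'rev-1'), (),
#                                     ['line one\n', 'line two\n'])
#
# The returned sha1 and byte count describe the inserted text; the third
# value is None here because no opaque parent_texts representation is
# handed back by this implementation.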
4398.8.6
by John Arbash Meinel
Switch the api from VF.add_text to VF._add_text and trim some extra 'features'. |
1243 |
def _add_text(self, key, parents, text, nostore_sha=None, random_id=False): |
4398.9.1
by Matt Nordhoff
Update _add_text docstrings that still referred to add_text. |
1244 |
"""See VersionedFiles._add_text()."""
|
4398.8.4
by John Arbash Meinel
Implement add_text for GroupCompressVersionedFiles |
1245 |
self._index._check_write_ok() |
1246 |
self._check_add(key, None, random_id, check_content=False) |
|
1247 |
if text.__class__ is not str: |
|
1248 |
raise errors.BzrBadParameterUnicode("text") |
|
1249 |
if parents is None: |
|
1250 |
# The caller might pass None if there is no graph data, but kndx
|
|
1251 |
# indexes can't directly store that, so we give them
|
|
1252 |
# an empty tuple instead.
|
|
1253 |
parents = () |
|
1254 |
# double handling for now. Make it work until then.
|
|
1255 |
length = len(text) |
|
1256 |
record = FulltextContentFactory(key, parents, None, text) |
|
1257 |
sha1 = list(self._insert_record_stream([record], random_id=random_id, |
|
1258 |
nostore_sha=nostore_sha))[0] |
|
1259 |
return sha1, length, None |
|
1260 |
||
3735.31.7
by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos. |
1261 |
def add_fallback_versioned_files(self, a_versioned_files): |
1262 |
"""Add a source of texts for texts not present in this knit.
|
|
1263 |
||
1264 |
:param a_versioned_files: A VersionedFiles object.
|
|
1265 |
"""
|
|
1266 |
self._fallback_vfs.append(a_versioned_files) |
|
1267 |
||
0.17.4
by Robert Collins
Annotate. |
1268 |
def annotate(self, key): |
1269 |
"""See VersionedFiles.annotate."""
|
|
4454.3.58
by John Arbash Meinel
Enable the new annotator for gc format repos. |
1270 |
ann = annotate.Annotator(self) |
1271 |
return ann.annotate_flat(key) |
|
0.17.4
by Robert Collins
Annotate. |
1272 |
|
4454.3.65
by John Arbash Meinel
Tests that VF implementations support .get_annotator() |
1273 |
def get_annotator(self): |
1274 |
return annotate.Annotator(self) |
|
1275 |
||
4332.3.28
by Robert Collins
Start checking file texts in a single pass. |
1276 |
def check(self, progress_bar=None, keys=None): |
0.17.5
by Robert Collins
nograph tests completely passing. |
1277 |
"""See VersionedFiles.check()."""
|
4332.3.28
by Robert Collins
Start checking file texts in a single pass. |
1278 |
if keys is None: |
1279 |
keys = self.keys() |
|
1280 |
for record in self.get_record_stream(keys, 'unordered', True): |
|
1281 |
record.get_bytes_as('fulltext') |
|
1282 |
else: |
|
1283 |
return self.get_record_stream(keys, 'unordered', True) |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1284 |
|
0.17.2
by Robert Collins
Core proof of concept working. |
1285 |
def _check_add(self, key, lines, random_id, check_content): |
1286 |
"""check that version_id and lines are safe to add."""
|
|
1287 |
version_id = key[-1] |
|
0.17.26
by Robert Collins
Working better --gc-plain-chk. |
1288 |
if version_id is not None: |
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
1289 |
if osutils.contains_whitespace(version_id): |
3735.31.1
by John Arbash Meinel
Bring the groupcompress plugin into the brisbane-core branch. |
1290 |
raise errors.InvalidRevisionId(version_id, self) |
0.17.2
by Robert Collins
Core proof of concept working. |
1291 |
self.check_not_reserved_id(version_id) |
1292 |
# TODO: If random_id==False and the key is already present, we should
|
|
1293 |
# probably check that the existing content is identical to what is
|
|
1294 |
# being inserted, and otherwise raise an exception. This would make
|
|
1295 |
# the bundle code simpler.
|
|
1296 |
if check_content: |
|
1297 |
self._check_lines_not_unicode(lines) |
|
1298 |
self._check_lines_are_lines(lines) |
|
1299 |
||
4593.5.20
by John Arbash Meinel
Expose KnownGraph off of VersionedFiles |
1300 |
def get_known_graph_ancestry(self, keys): |
1301 |
"""Get a KnownGraph instance with the ancestry of keys."""
|
|
4634.11.2
by John Arbash Meinel
Teach VF.get_known_graph_ancestry to go to fallbacks (bug #419241) |
1302 |
# Note that this is identical to
|
1303 |
# KnitVersionedFiles.get_known_graph_ancestry, but they don't share
|
|
1304 |
# ancestry.
|
|
4634.11.3
by John Arbash Meinel
Implement _GCGraphIndex.find_ancestry() |
1305 |
parent_map, missing_keys = self._index.find_ancestry(keys) |
4634.11.2
by John Arbash Meinel
Teach VF.get_known_graph_ancestry to go to fallbacks (bug #419241) |
1306 |
for fallback in self._fallback_vfs: |
1307 |
if not missing_keys: |
|
1308 |
break
|
|
4634.11.3
by John Arbash Meinel
Implement _GCGraphIndex.find_ancestry() |
1309 |
(f_parent_map, f_missing_keys) = fallback._index.find_ancestry( |
1310 |
missing_keys) |
|
4634.11.2
by John Arbash Meinel
Teach VF.get_known_graph_ancestry to go to fallbacks (bug #419241) |
1311 |
parent_map.update(f_parent_map) |
1312 |
missing_keys = f_missing_keys |
|
4593.5.20
by John Arbash Meinel
Expose KnownGraph off of VersionedFiles |
1313 |
kg = _mod_graph.KnownGraph(parent_map) |
1314 |
return kg |
|
1315 |
||
0.17.5
by Robert Collins
nograph tests completely passing. |
1316 |
def get_parent_map(self, keys): |
3735.31.7
by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos. |
1317 |
"""Get a map of the graph parents of keys.
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1318 |
|
1319 |
:param keys: The keys to look up parents for.
|
|
1320 |
:return: A mapping from keys to parents. Absent keys are absent from
|
|
1321 |
the mapping.
|
|
1322 |
"""
|
|
3735.31.7
by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos. |
1323 |
return self._get_parent_map_with_sources(keys)[0] |
1324 |
||
1325 |
def _get_parent_map_with_sources(self, keys): |
|
1326 |
"""Get a map of the parents of keys.
|
|
1327 |
||
1328 |
:param keys: The keys to look up parents for.
|
|
1329 |
:return: A tuple. The first element is a mapping from keys to parents.
|
|
1330 |
Absent keys are absent from the mapping. The second element is a
|
|
1331 |
list with the locations each key was found in. The first element
|
|
1332 |
is the in-this-knit parents, the second the first fallback source,
|
|
1333 |
and so on.
|
|
1334 |
"""
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1335 |
result = {} |
3735.31.7
by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos. |
1336 |
sources = [self._index] + self._fallback_vfs |
0.17.5
by Robert Collins
nograph tests completely passing. |
1337 |
source_results = [] |
1338 |
missing = set(keys) |
|
1339 |
for source in sources: |
|
1340 |
if not missing: |
|
1341 |
break
|
|
1342 |
new_result = source.get_parent_map(missing) |
|
1343 |
source_results.append(new_result) |
|
1344 |
result.update(new_result) |
|
1345 |
missing.difference_update(set(new_result)) |
|
3735.31.7
by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos. |
1346 |
return result, source_results |
0.17.5
by Robert Collins
nograph tests completely passing. |
1347 |
|
4634.8.1
by Andrew Bennetts
Cherry-pick fix for bug #402657 from gc-batching. |
1348 |
def _get_blocks(self, read_memos): |
1349 |
"""Get GroupCompressBlocks for the given read_memos.
|
|
1350 |
||
1351 |
:returns: a series of (read_memo, block) pairs, in the order they were
|
|
1352 |
originally passed.
|
|
1353 |
"""
|
|
1354 |
cached = {} |
|
1355 |
for read_memo in read_memos: |
|
1356 |
try: |
|
1357 |
block = self._group_cache[read_memo] |
|
1358 |
except KeyError: |
|
1359 |
pass
|
|
1360 |
else: |
|
1361 |
cached[read_memo] = block |
|
1362 |
not_cached = [] |
|
1363 |
not_cached_seen = set() |
|
1364 |
for read_memo in read_memos: |
|
1365 |
if read_memo in cached: |
|
1366 |
# Don't fetch what we already have
|
|
1367 |
continue
|
|
1368 |
if read_memo in not_cached_seen: |
|
1369 |
# Don't try to fetch the same data twice
|
|
1370 |
continue
|
|
1371 |
not_cached.append(read_memo) |
|
1372 |
not_cached_seen.add(read_memo) |
|
1373 |
raw_records = self._access.get_raw_records(not_cached) |
|
1374 |
for read_memo in read_memos: |
|
1375 |
try: |
|
1376 |
yield read_memo, cached[read_memo] |
|
1377 |
except KeyError: |
|
1378 |
# Read the block, and cache it.
|
|
1379 |
zdata = raw_records.next() |
|
1380 |
block = GroupCompressBlock.from_bytes(zdata) |
|
1381 |
self._group_cache[read_memo] = block |
|
1382 |
cached[read_memo] = block |
|
1383 |
yield read_memo, block |
|
0.20.14
by John Arbash Meinel
Factor out _get_group_and_delta_lines. |
1384 |
|
0.20.18
by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys() |
1385 |
def get_missing_compression_parent_keys(self): |
1386 |
"""Return the keys of missing compression parents.
|
|
1387 |
||
1388 |
Missing compression parents occur when a record stream was missing
|
|
1389 |
basis texts, or an index was scanned that had missing basis texts.
|
|
1390 |
"""
|
|
1391 |
# GroupCompress cannot currently reference texts that are not in the
|
|
1392 |
# group, so this is valid for now
|
|
1393 |
return frozenset() |
|
1394 |
||
0.17.5
by Robert Collins
nograph tests completely passing. |
1395 |
def get_record_stream(self, keys, ordering, include_delta_closure): |
1396 |
"""Get a stream of records for keys.
|
|
1397 |
||
1398 |
:param keys: The keys to include.
|
|
1399 |
:param ordering: Either 'unordered' or 'topological'. A topologically
|
|
1400 |
sorted stream has compression parents strictly before their
|
|
1401 |
children.
|
|
1402 |
:param include_delta_closure: If True then the closure across any
|
|
1403 |
compression parents will be included (in the opaque data).
|
|
1404 |
:return: An iterator of ContentFactory objects, each of which is only
|
|
1405 |
valid until the iterator is advanced.
|
|
1406 |
"""
|
|
1407 |
# keys might be a generator
|
|
0.22.6
by John Arbash Meinel
Clustering chk pages properly makes a big difference. |
1408 |
orig_keys = list(keys) |
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1409 |
keys = set(keys) |
0.17.5
by Robert Collins
nograph tests completely passing. |
1410 |
if not keys: |
1411 |
return
|
|
0.20.23
by John Arbash Meinel
Add a progress indicator for chk pages. |
1412 |
if (not self._index.has_graph |
3735.31.14
by John Arbash Meinel
Change the gc-optimal to 'groupcompress' |
1413 |
and ordering in ('topological', 'groupcompress')): |
0.17.5
by Robert Collins
nograph tests completely passing. |
1414 |
# Cannot do topological ordering when no graph has been stored,
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1415 |
# but we allow 'as-requested' or 'unordered'
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1416 |
ordering = 'unordered' |
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1417 |
|
1418 |
remaining_keys = keys |
|
1419 |
while True: |
|
1420 |
try: |
|
1421 |
keys = set(remaining_keys) |
|
1422 |
for content_factory in self._get_remaining_record_stream(keys, |
|
1423 |
orig_keys, ordering, include_delta_closure): |
|
1424 |
remaining_keys.discard(content_factory.key) |
|
1425 |
yield content_factory |
|
1426 |
return
|
|
1427 |
except errors.RetryWithNewPacks, e: |
|
1428 |
self._access.reload_or_raise(e) |
|
1429 |
||
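# A hedged consumption sketch for get_record_stream() (keys invented):
#
#   keys = [('file-id', 'rev-1'), ('file-id', 'rev-2')]
#   for record in vf.get_record_stream(keys, 'unordered', True):
#       if record.storage_kind == 'absent':
#           continue
#       text = record.get_bytes_as('fulltext')
#
# Each factory is only valid until the iterator is advanced, so the bytes
# must be pulled out before moving on to the next record.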
1430 |
def _find_from_fallback(self, missing): |
|
1431 |
"""Find whatever keys you can from the fallbacks.
|
|
1432 |
||
1433 |
:param missing: A set of missing keys. This set will be mutated as keys
|
|
1434 |
are found in a fallback_vfs.
|
|
1435 |
:return: (parent_map, key_to_source_map, source_results)
|
|
1436 |
parent_map the overall key => parent_keys
|
|
1437 |
key_to_source_map a dict from {key: source}
|
|
1438 |
source_results a list of (source: keys)
|
|
1439 |
"""
|
|
1440 |
parent_map = {} |
|
1441 |
key_to_source_map = {} |
|
1442 |
source_results = [] |
|
1443 |
for source in self._fallback_vfs: |
|
1444 |
if not missing: |
|
1445 |
break
|
|
1446 |
source_parents = source.get_parent_map(missing) |
|
1447 |
parent_map.update(source_parents) |
|
1448 |
source_parents = list(source_parents) |
|
1449 |
source_results.append((source, source_parents)) |
|
1450 |
key_to_source_map.update((key, source) for key in source_parents) |
|
1451 |
missing.difference_update(source_parents) |
|
1452 |
return parent_map, key_to_source_map, source_results |
|
1453 |
||
1454 |
def _get_ordered_source_keys(self, ordering, parent_map, key_to_source_map): |
|
1455 |
"""Get the (source, [keys]) list.
|
|
1456 |
||
1457 |
The returned objects should be in the order defined by 'ordering',
|
|
1458 |
which can weave between different sources.
|
|
1459 |
:param ordering: Must be one of 'topological' or 'groupcompress'
|
|
1460 |
:return: List of [(source, [keys])] tuples, such that all keys are in
|
|
1461 |
the defined order, regardless of source.
|
|
1462 |
"""
|
|
1463 |
if ordering == 'topological': |
|
1464 |
present_keys = topo_sort(parent_map) |
|
1465 |
else: |
|
1466 |
# ordering == 'groupcompress'
|
|
1467 |
# XXX: This only optimizes for the target ordering. We may need
|
|
1468 |
# to balance that with the time it takes to extract
|
|
1469 |
# ordering, by somehow grouping based on
|
|
1470 |
# locations[key][0:3]
|
|
1471 |
present_keys = sort_gc_optimal(parent_map) |
|
1472 |
# Now group by source:
|
|
1473 |
source_keys = [] |
|
1474 |
current_source = None |
|
1475 |
for key in present_keys: |
|
1476 |
source = key_to_source_map.get(key, self) |
|
1477 |
if source is not current_source: |
|
1478 |
source_keys.append((source, [])) |
|
3735.32.12
by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types. |
1479 |
current_source = source |
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1480 |
source_keys[-1][1].append(key) |
1481 |
return source_keys |
|
1482 |
||
1483 |
def _get_as_requested_source_keys(self, orig_keys, locations, unadded_keys, |
|
1484 |
key_to_source_map): |
|
1485 |
source_keys = [] |
|
1486 |
current_source = None |
|
1487 |
for key in orig_keys: |
|
1488 |
if key in locations or key in unadded_keys: |
|
1489 |
source = self |
|
1490 |
elif key in key_to_source_map: |
|
1491 |
source = key_to_source_map[key] |
|
1492 |
else: # absent |
|
1493 |
continue
|
|
1494 |
if source is not current_source: |
|
1495 |
source_keys.append((source, [])) |
|
3735.32.12
by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types. |
1496 |
current_source = source |
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1497 |
source_keys[-1][1].append(key) |
1498 |
return source_keys |
|
1499 |
||
1500 |
def _get_io_ordered_source_keys(self, locations, unadded_keys, |
|
1501 |
source_result): |
|
1502 |
def get_group(key): |
|
1503 |
# This is the group the bytes are stored in, followed by the
|
|
1504 |
# location in the group
|
|
1505 |
return locations[key][0] |
|
1506 |
present_keys = sorted(locations.iterkeys(), key=get_group) |
|
1507 |
# We don't have an ordering for keys in the in-memory object, but
|
|
1508 |
# let's process the in-memory ones first.
|
|
1509 |
present_keys = list(unadded_keys) + present_keys |
|
1510 |
# Now grab all of the ones from other sources
|
|
1511 |
source_keys = [(self, present_keys)] |
|
1512 |
source_keys.extend(source_result) |
|
1513 |
return source_keys |
|
1514 |
||
1515 |
def _get_remaining_record_stream(self, keys, orig_keys, ordering, |
|
1516 |
include_delta_closure): |
|
1517 |
"""Get a stream of records for keys.
|
|
1518 |
||
1519 |
:param keys: The keys to include.
|
|
1520 |
:param ordering: one of 'unordered', 'topological', 'groupcompress' or
|
|
1521 |
'as-requested'
|
|
1522 |
:param include_delta_closure: If True then the closure across any
|
|
1523 |
compression parents will be included (in the opaque data).
|
|
1524 |
:return: An iterator of ContentFactory objects, each of which is only
|
|
1525 |
valid until the iterator is advanced.
|
|
1526 |
"""
|
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1527 |
# Cheap: iterate
|
1528 |
locations = self._index.get_build_details(keys) |
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1529 |
unadded_keys = set(self._unadded_refs).intersection(keys) |
1530 |
missing = keys.difference(locations) |
|
1531 |
missing.difference_update(unadded_keys) |
|
1532 |
(fallback_parent_map, key_to_source_map, |
|
1533 |
source_result) = self._find_from_fallback(missing) |
|
1534 |
if ordering in ('topological', 'groupcompress'): |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1535 |
# would be better to not globally sort initially but instead
|
1536 |
# start with one key, recurse to its oldest parent, then grab
|
|
1537 |
# everything in the same group, etc.
|
|
1538 |
parent_map = dict((key, details[2]) for key, details in |
|
1539 |
locations.iteritems()) |
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1540 |
for key in unadded_keys: |
1541 |
parent_map[key] = self._unadded_refs[key] |
|
1542 |
parent_map.update(fallback_parent_map) |
|
1543 |
source_keys = self._get_ordered_source_keys(ordering, parent_map, |
|
1544 |
key_to_source_map) |
|
0.22.6
by John Arbash Meinel
Clustering chk pages properly makes a big difference. |
1545 |
elif ordering == 'as-requested': |
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1546 |
source_keys = self._get_as_requested_source_keys(orig_keys, |
1547 |
locations, unadded_keys, key_to_source_map) |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1548 |
else: |
0.20.10
by John Arbash Meinel
Change the extraction ordering for 'unordered'. |
1549 |
# We want to yield the keys in a semi-optimal (read-wise) ordering.
|
1550 |
# Otherwise we thrash the _group_cache and destroy performance
|
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1551 |
source_keys = self._get_io_ordered_source_keys(locations, |
1552 |
unadded_keys, source_result) |
|
1553 |
for key in missing: |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1554 |
yield AbsentContentFactory(key) |
4634.8.1
by Andrew Bennetts
Cherry-pick fix for bug #402657 from gc-batching. |
1555 |
# Batch up as many keys as we can until either:
|
1556 |
# - we encounter an unadded ref, or
|
|
1557 |
# - we run out of keys, or
|
|
1558 |
# - the total bytes to retrieve for this batch > BATCH_SIZE
|
|
1559 |
batcher = _BatchingBlockFetcher(self, locations) |
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1560 |
for source, keys in source_keys: |
1561 |
if source is self: |
|
1562 |
for key in keys: |
|
1563 |
if key in self._unadded_refs: |
|
4634.8.1
by Andrew Bennetts
Cherry-pick fix for bug #402657 from gc-batching. |
1564 |
# Flush batch, then yield unadded ref from
|
1565 |
# self._compressor.
|
|
1566 |
for factory in batcher.yield_factories(full_flush=True): |
|
1567 |
yield factory |
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1568 |
bytes, sha1 = self._compressor.extract(key) |
1569 |
parents = self._unadded_refs[key] |
|
3735.32.12
by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types. |
1570 |
yield FulltextContentFactory(key, parents, sha1, bytes) |
4634.8.1
by Andrew Bennetts
Cherry-pick fix for bug #402657 from gc-batching. |
1571 |
continue
|
1572 |
if batcher.add_key(key) > BATCH_SIZE: |
|
1573 |
# Ok, this batch is big enough. Yield some results.
|
|
1574 |
for factory in batcher.yield_factories(): |
|
1575 |
yield factory |
|
0.17.11
by Robert Collins
Add extraction of just-compressed texts to support converting from knits. |
1576 |
else: |
4634.8.1
by Andrew Bennetts
Cherry-pick fix for bug #402657 from gc-batching. |
1577 |
for factory in batcher.yield_factories(full_flush=True): |
1578 |
yield factory |
|
3735.31.18
by John Arbash Meinel
Implement stacking support across all ordering implementations. |
1579 |
for record in source.get_record_stream(keys, ordering, |
1580 |
include_delta_closure): |
|
1581 |
yield record |
|
4634.8.1
by Andrew Bennetts
Cherry-pick fix for bug #402657 from gc-batching. |
1582 |
for factory in batcher.yield_factories(full_flush=True): |
1583 |
yield factory |
|
0.20.5
by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks. |
1584 |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1585 |
def get_sha1s(self, keys): |
1586 |
"""See VersionedFiles.get_sha1s()."""
|
|
1587 |
result = {} |
|
1588 |
for record in self.get_record_stream(keys, 'unordered', True): |
|
1589 |
if record.sha1 is not None: |
|
1590 |
result[record.key] = record.sha1 |
|
1591 |
else: |
|
1592 |
if record.storage_kind != 'absent': |
|
3735.40.2
by John Arbash Meinel
Add a groupcompress.encode_copy_instruction function. |
1593 |
result[record.key] = osutils.sha_string( |
1594 |
record.get_bytes_as('fulltext')) |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1595 |
return result |
1596 |
||
0.17.2
by Robert Collins
Core proof of concept working. |
1597 |
def insert_record_stream(self, stream): |
1598 |
"""Insert a record stream into this container.
|
|
1599 |
||
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
1600 |
:param stream: A stream of records to insert.
|
0.17.2
by Robert Collins
Core proof of concept working. |
1601 |
:return: None
|
1602 |
:seealso VersionedFiles.get_record_stream:
|
|
1603 |
"""
|
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
1604 |
# XXX: Setting random_id=True makes
|
1605 |
# test_insert_record_stream_existing_keys fail for groupcompress and
|
|
1606 |
# groupcompress-nograph, this needs to be revisited while addressing
|
|
1607 |
# 'bzr branch' performance issues.
|
|
1608 |
for _ in self._insert_record_stream(stream, random_id=False): |
|
0.17.5
by Robert Collins
nograph tests completely passing. |
1609 |
pass
|
0.17.2
by Robert Collins
Core proof of concept working. |
1610 |
|
3735.32.21
by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al. |
1611 |
def _insert_record_stream(self, stream, random_id=False, nostore_sha=None, |
1612 |
reuse_blocks=True): |
|
0.17.2
by Robert Collins
Core proof of concept working. |
1613 |
"""Internal core to insert a record stream into this container.
|
1614 |
||
1615 |
This helper function has a different interface from insert_record_stream
|
|
1616 |
to allow add_lines to be minimal, but still return the needed data.
|
|
1617 |
||
3735.31.2
by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts. |
1618 |
:param stream: A stream of records to insert.
|
3735.31.12
by John Arbash Meinel
Push nostore_sha down through the stack. |
1619 |
:param nostore_sha: If the sha1 of a given text matches nostore_sha,
|
1620 |
raise ExistingContent, rather than committing the new text.
|
|
3735.32.21
by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al. |
1621 |
:param reuse_blocks: If the source is streaming from
|
1622 |
groupcompress-blocks, just insert the blocks as-is, rather than
|
|
1623 |
expanding the texts and inserting again.
|
|
0.17.2
by Robert Collins
Core proof of concept working. |
1624 |
:return: An iterator over the sha1 of the inserted records.
|
1625 |
:seealso insert_record_stream:
|
|
1626 |
:seealso add_lines:
|
|
1627 |
"""
|
|
0.20.29
by Ian Clatworthy
groupcompress.py code cleanups |
1628 |
adapters = {} |
0.17.5
by Robert Collins
nograph tests completely passing. |
1629 |
def get_adapter(adapter_key): |
1630 |
try: |
|
1631 |
return adapters[adapter_key] |
|
1632 |
except KeyError: |
|
1633 |
adapter_factory = adapter_registry.get(adapter_key) |
|
1634 |
adapter = adapter_factory(self) |
|
1635 |
adapters[adapter_key] = adapter |
|
1636 |
return adapter |
|
0.17.2
by Robert Collins
Core proof of concept working. |
1637 |
# This will go up to fulltexts for gc to gc fetching, which isn't
|
1638 |
# ideal.
|
|
3735.32.19
by John Arbash Meinel
Get rid of the 'delta' flag to GroupCompressor. It didn't do anything anyway. |
1639 |
self._compressor = GroupCompressor() |
0.17.11
by Robert Collins
Add extraction of just-compressed texts to support converting from knits. |
1640 |
self._unadded_refs = {} |
0.17.5
by Robert Collins
nograph tests completely passing. |
1641 |
keys_to_add = [] |
0.17.6
by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big). |
1642 |
def flush(): |
3735.32.23
by John Arbash Meinel
Add a _LazyGroupContentManager._check_rebuild_block |
1643 |
bytes = self._compressor.flush().to_bytes() |
0.17.6
by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big). |
1644 |
index, start, length = self._access.add_raw_records( |
0.25.7
by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content. |
1645 |
[(None, len(bytes))], bytes)[0] |
0.17.6
by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big). |
1646 |
nodes = [] |
1647 |
for key, reads, refs in keys_to_add: |
|
1648 |
nodes.append((key, "%d %d %s" % (start, length, reads), refs)) |
|
1649 |
self._index.add_records(nodes, random_id=random_id) |
|
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
1650 |
self._unadded_refs = {} |
1651 |
del keys_to_add[:] |
|
3735.32.19
by John Arbash Meinel
Get rid of the 'delta' flag to GroupCompressor. It didn't do anything anyway. |
1652 |
self._compressor = GroupCompressor() |
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
1653 |
|
0.20.15
by John Arbash Meinel
Change so that regions that have lots of copies get converted back |
1654 |
last_prefix = None |
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
1655 |
max_fulltext_len = 0 |
0.25.11
by John Arbash Meinel
Slightly different handling of large texts. |
1656 |
max_fulltext_prefix = None |
3735.32.20
by John Arbash Meinel
groupcompress now copies the blocks exactly as they were given. |
1657 |
insert_manager = None |
1658 |
block_start = None |
|
1659 |
block_length = None |
|
3735.36.15
by John Arbash Meinel
Set 'combine_backing_indices=False' as the default for text and chk indices. |
1660 |
# XXX: TODO: remove this, it is just for safety checking for now
|
1661 |
inserted_keys = set() |
|
4634.23.1
by Robert Collins
Cherrypick from bzr.dev: Fix bug 402652: recompress badly packed groups during fetch. (John Arbash Meinel, Robert Collins) |
1662 |
reuse_this_block = reuse_blocks |
0.17.2
by Robert Collins
Core proof of concept working. |
1663 |
for record in stream: |
0.17.5
by Robert Collins
nograph tests completely passing. |
1664 |
# Raise an error when a record is missing.
|
1665 |
if record.storage_kind == 'absent': |
|
0.20.29
by Ian Clatworthy
groupcompress.py code cleanups |
1666 |
raise errors.RevisionNotPresent(record.key, self) |
3735.36.15
by John Arbash Meinel
Set 'combine_backing_indices=False' as the default for text and chk indices. |
1667 |
if random_id: |
1668 |
if record.key in inserted_keys: |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
1669 |
trace.note('Insert claimed random_id=True,' |
1670 |
' but then inserted %r two times', record.key) |
|
3735.36.15
by John Arbash Meinel
Set 'combine_backing_indices=False' as the default for text and chk indices. |
1671 |
continue
|
1672 |
inserted_keys.add(record.key) |
|
3735.32.21
by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al. |
1673 |
if reuse_blocks: |
1674 |
# If the reuse_blocks flag is set, check to see if we can just
|
|
1675 |
# copy a groupcompress block as-is.
|
|
4634.23.1
by Robert Collins
Cherrypick from bzr.dev: Fix bug 402652: recompress badly packed groups during fetch. (John Arbash Meinel, Robert Collins) |
1676 |
# We only check on the first record (groupcompress-block) not
|
1677 |
# on all of the (groupcompress-block-ref) entries.
|
|
1678 |
# The reuse_this_block flag is then kept for as long as
|
|
1679 |
if record.storage_kind == 'groupcompress-block': |
|
1680 |
# Check to see if we really want to re-use this block
|
|
1681 |
insert_manager = record._manager |
|
1682 |
reuse_this_block = insert_manager.check_is_well_utilized() |
|
1683 |
else: |
|
1684 |
reuse_this_block = False |
|
1685 |
if reuse_this_block: |
|
1686 |
# We still want to reuse this block
|
|
3735.32.21
by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al. |
1687 |
if record.storage_kind == 'groupcompress-block': |
1688 |
# Insert the raw block into the target repo
|
|
1689 |
insert_manager = record._manager |
|
1690 |
bytes = record._manager._block.to_bytes() |
|
1691 |
_, start, length = self._access.add_raw_records( |
|
1692 |
[(None, len(bytes))], bytes)[0] |
|
1693 |
del bytes |
|
1694 |
block_start = start |
|
1695 |
block_length = length |
|
1696 |
if record.storage_kind in ('groupcompress-block', |
|
1697 |
'groupcompress-block-ref'): |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
1698 |
if insert_manager is None: |
1699 |
raise AssertionError('No insert_manager set') |
|
4634.23.1
by Robert Collins
Cherrypick from bzr.dev: Fix bug 402652: recompress badly packed groups during fetch. (John Arbash Meinel, Robert Collins) |
1700 |
if insert_manager is not record._manager: |
1701 |
raise AssertionError('insert_manager does not match' |
|
1702 |
' the current record, we cannot be positive'
|
|
1703 |
' that the appropriate content was inserted.'
|
|
1704 |
)
|
|
3735.32.21
by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al. |
1705 |
value = "%d %d %d %d" % (block_start, block_length, |
1706 |
record._start, record._end) |
|
1707 |
nodes = [(record.key, value, (record.parents,))] |
|
3735.38.1
by John Arbash Meinel
Change the delta byte stream to remove the 'source length' entry. |
1708 |
# TODO: Consider buffering up many nodes to be added, not
|
1709 |
# sure how much overhead this has, but we're seeing
|
|
1710 |
# ~23s / 120s in add_records calls
|
|
3735.32.21
by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al. |
1711 |
self._index.add_records(nodes, random_id=random_id) |
1712 |
continue
|
|
0.20.18
by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys() |
1713 |
try: |
0.23.52
by John Arbash Meinel
Use the max_delta flag. |
1714 |
bytes = record.get_bytes_as('fulltext') |
0.20.18
by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys() |
1715 |
except errors.UnavailableRepresentation: |
0.17.5
by Robert Collins
nograph tests completely passing. |
1716 |
adapter_key = record.storage_kind, 'fulltext' |
1717 |
adapter = get_adapter(adapter_key) |
|
0.20.21
by John Arbash Meinel
Merge the chk sorting code. |
1718 |
bytes = adapter.get_bytes(record) |
0.20.13
by John Arbash Meinel
Play around a bit. |
1719 |
if len(record.key) > 1: |
1720 |
prefix = record.key[0] |
|
0.25.11
by John Arbash Meinel
Slightly different handling of large texts. |
1721 |
soft = (prefix == last_prefix) |
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
1722 |
else: |
1723 |
prefix = None |
|
0.25.11
by John Arbash Meinel
Slightly different handling of large texts. |
1724 |
soft = False |
1725 |
if max_fulltext_len < len(bytes): |
|
1726 |
max_fulltext_len = len(bytes) |
|
1727 |
max_fulltext_prefix = prefix |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
1728 |
(found_sha1, start_point, end_point, |
1729 |
type) = self._compressor.compress(record.key, |
|
1730 |
bytes, record.sha1, soft=soft, |
|
1731 |
nostore_sha=nostore_sha) |
|
1732 |
# delta_ratio = float(len(bytes)) / (end_point - start_point)
|
|
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
1733 |
# Check if we want to continue to include that text
|
0.25.11
by John Arbash Meinel
Slightly different handling of large texts. |
1734 |
if (prefix == max_fulltext_prefix |
1735 |
and end_point < 2 * max_fulltext_len): |
|
1736 |
# As long as we are on the same file_id, we will fill at least
|
|
1737 |
# 2 * max_fulltext_len
|
|
1738 |
start_new_block = False |
|
1739 |
elif end_point > 4*1024*1024: |
|
1740 |
start_new_block = True |
|
1741 |
elif (prefix is not None and prefix != last_prefix |
|
1742 |
and end_point > 2*1024*1024): |
|
1743 |
start_new_block = True |
|
1744 |
else: |
|
1745 |
start_new_block = False |
|
0.25.10
by John Arbash Meinel
Play around with detecting compression breaks. |
1746 |
last_prefix = prefix |
1747 |
if start_new_block: |
|
1748 |
self._compressor.pop_last() |
|
1749 |
flush() |
|
1750 |
max_fulltext_len = len(bytes) |
|
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core. |
1751 |
(found_sha1, start_point, end_point, |
1752 |
type) = self._compressor.compress(record.key, bytes, |
|
1753 |
record.sha1) |
|
0.17.26
by Robert Collins
Working better --gc-plain-chk. |
1754 |
if record.key[-1] is None: |
1755 |
key = record.key[:-1] + ('sha1:' + found_sha1,) |
|
1756 |
else: |
|
1757 |
key = record.key |
|
1758 |
self._unadded_refs[key] = record.parents |
|
0.17.3
by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression. |
1759 |
yield found_sha1 |
3735.2.164
by John Arbash Meinel
Fix a critical bug that caused problems with the index entries. |
1760 |
keys_to_add.append((key, '%d %d' % (start_point, end_point), |
0.17.5
by Robert Collins
nograph tests completely passing. |
1761 |
(record.parents,))) |
0.17.8
by Robert Collins
Flush pending updates at the end of _insert_record_stream |
1762 |
if len(keys_to_add): |
1763 |
flush() |
|
0.17.11
by Robert Collins
Add extraction of just-compressed texts to support converting from knits. |
1764 |
self._compressor = None |
0.17.5
by Robert Collins
nograph tests completely passing. |
1765 |
|
1766 |
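
    # Illustrative summary of the group-splitting heuristic used in
    # _insert_record_stream above; the byte counts are hypothetical, only the
    # thresholds come from the code:
    #  * while texts share the prefix that produced the largest fulltext so
    #    far, the current group may grow to 2 * max_fulltext_len (a 3MB
    #    largest text allows a ~6MB group);
    #  * otherwise a group is cut once it passes 4MB (4*1024*1024), and a
    #    change of prefix (file_id) forces a cut once it passes 2MB;
    #  * on a cut, the last text is popped from the compressor, the group is
    #    flushed via flush(), and that text is re-compressed as the start of
    #    the next group.
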
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            if pb is not None:
                pb.update('Walking content', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(key, self)
            lines = osutils.split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        if pb is not None:
            pb.update('Walking content', total, total)
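
    # Minimal usage sketch (the variable names and keys below are
    # hypothetical, not from this module):
    #   vf = repo.texts   # a GroupCompressVersionedFiles instance
    #   for line, key in vf.iter_lines_added_or_present_in_keys(
    #           [('file-id', 'rev-1'), ('file-id', 'rev-2')]):
    #       process(line, key)
    # Lines arrive in storage order, so callers must not rely on ordering.
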
    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index] + self._fallback_vfs
        result = set()
        for source in sources:
            result.update(source.keys())
        return result


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None, track_external_parent_refs=False,
        inconsistency_fatal=True, track_new_keys=False):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback, returns True if the index is locked and
            thus usable.
        :param parents: If True, record knit parents; if not, do not record
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        :param track_external_parent_refs: As keys are added, keep track of the
            keys they reference, so that we can query get_missing_parents(),
            etc.
        :param inconsistency_fatal: When asked to add records that are already
            present, and the details are inconsistent with the existing
            record, raise an exception instead of warning (and skipping the
            record).
        :param track_new_keys: If True, also keep track of the keys that are
            added (only used when track_external_parent_refs is set).
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked
        self._inconsistency_fatal = inconsistency_fatal
        if track_external_parent_refs:
            self._key_dependencies = knit._KeyRefs(
                track_new_keys=track_new_keys)
        else:
            self._key_dependencies = None
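
    # Illustrative wiring (an assumption, not taken from this module): a
    # writable index is typically backed by an in-memory GraphIndex builder
    # whose add_nodes method serves as the add_callback:
    #   builder = BTreeBuilder(reference_lists=1)
    #   index = _GCGraphIndex(builder, is_locked=lambda: True, parents=True,
    #                         add_callback=builder.add_nodes)
    # add_records() below then prepares [(key, value, node_refs), ...] tuples
    # and hands them to that callback.
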
    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the Immutable GraphIndex
        backing this _GCGraphIndex, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples:
                         (key, value, node_refs).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    details = '%s %s %s' % (key, (value, node_refs), keys[key])
                    if self._inconsistency_fatal:
                        raise errors.KnitCorrupt(self, "inconsistent details"
                                                 " in add_records: %s" %
                                                 details)
                    else:
                        trace.warning("inconsistent details in skipped"
                                      " record: %s", details)
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        key_dependencies = self._key_dependencies
        if key_dependencies is not None:
            if self._parents:
                for key, value, refs in records:
                    parents = refs[0]
                    key_dependencies.add_references(key, parents)
            else:
                for key, value, refs in records:
                    key_dependencies.add_key(key)
        self._add_callback(records)
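
    # Example node shapes accepted above (hypothetical keys and offsets):
    # with parents,
    #   (('rev-1',), '12 4096 0 371', ((('rev-0',),),))
    # and for a parentless index,
    #   (('rev-1',), '12 4096 0 371', ())
    # where the value string is the '%d %d %d %d' block/offset memo produced
    # when records are inserted.
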
    def _check_read(self):
        """Raise an exception if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _check_write_ok(self):
        """Raise an exception if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        Note: Callers are responsible for checking that the index is locked
        before calling this method.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)

    def find_ancestry(self, keys):
        """See CombinedGraphIndex.find_ancestry"""
        return self._graph_index.find_ancestry(keys, 0)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        result = {}
        if self._parents:
            for node in nodes:
                result[node[1]] = node[3][0]
        else:
            for node in nodes:
                result[node[1]] = None
        return result
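
    # For example (hypothetical revisions), a populated graph index would
    # answer
    #   index.get_parent_map([('rev-1',)]) == {('rev-1',): (('rev-0',),)}
    # while a parentless index maps every present key to None and simply
    # omits keys it does not know about.
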
    def get_missing_parents(self):
        """Return the keys of missing parents."""
        # Copied from _KnitGraphIndex.get_missing_parents
        # We may have false positives, so filter those out.
        self._key_dependencies.satisfy_refs_for_keys(
            self.get_parent_map(self._key_dependencies.get_unsatisfied_refs()))
        return frozenset(self._key_dependencies.get_unsatisfied_refs())

    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys)
        for entry in entries:
            key = entry[1]
            if not self._parents:
                parents = None
            else:
                parents = entry[3][0]
            method = 'group'
            result[key] = (self._node_to_position(entry),
                           None, parents, (method, None))
        return result

    def keys(self):
        """Get all the keys in the collection.

        The keys are not ordered.
        """
        self._check_read()
        return [node[1] for node in self._graph_index.iter_all_entries()]

    def _node_to_position(self, node):
        """Convert an index value to position details."""
        bits = node[2].split(' ')
        # It would be nice not to read the entire gzip.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end
4343.3.2
by John Arbash Meinel
All stacking tests seem to be passing for dev6 repos |
2034 |
def scan_unvalidated_index(self, graph_index): |
2035 |
"""Inform this _GCGraphIndex that there is an unvalidated index.
|
|
2036 |
||
2037 |
This allows this _GCGraphIndex to keep track of any missing
|
|
2038 |
compression parents we may want to have filled in to make those
|
|
4634.29.3
by Andrew Bennetts
Simplify further. |
2039 |
indices valid. It also allows _GCGraphIndex to track any new keys.
|
4343.3.2
by John Arbash Meinel
All stacking tests seem to be passing for dev6 repos |
2040 |
|
2041 |
:param graph_index: A GraphIndex
|
|
2042 |
"""
|
|
4634.29.3
by Andrew Bennetts
Simplify further. |
2043 |
key_dependencies = self._key_dependencies |
4634.29.6
by Andrew Bennetts
Put new key tracking in _KeyRefs rather than alongside it. |
2044 |
if key_dependencies is None: |
4634.29.1
by Andrew Bennetts
Rough code to reject commit_write_group if any inventory's CHK root is absent. |
2045 |
return
|
2046 |
for node in graph_index.iter_all_entries(): |
|
4634.29.6
by Andrew Bennetts
Put new key tracking in _KeyRefs rather than alongside it. |
2047 |
# Add parent refs from graph_index (and discard parent refs
|
2048 |
# that the graph_index has).
|
|
2049 |
key_dependencies.add_references(node[1], node[3][0]) |
|
4343.3.2
by John Arbash Meinel
All stacking tests seem to be passing for dev6 repos |
2050 |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
2051 |
|
from bzrlib._groupcompress_py import (
    apply_delta,
    apply_delta_to_source,
    encode_base128_int,
    decode_base128_int,
    decode_copy_instruction,
    LinesDeltaIndex,
    )
try:
    from bzrlib._groupcompress_pyx import (
        apply_delta,
        apply_delta_to_source,
        DeltaIndex,
        encode_base128_int,
        decode_base128_int,
        )
    GroupCompressor = PyrexGroupCompressor
except ImportError:
    GroupCompressor = PythonGroupCompressor
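

# A minimal sketch (not part of the original module) of the base128 helpers
# imported above: integers round-trip through encode_base128_int and
# decode_base128_int, which also reports how many bytes were consumed.  The
# function name and sample value are illustrative only.
def _base128_roundtrip_example(value=123456):
    """Encode an integer and decode it back, returning the encoded bytes."""
    encoded = encode_base128_int(value)
    decoded, consumed = decode_base128_int(encoded)
    if decoded != value or consumed != len(encoded):
        raise AssertionError('base128 round-trip failed for %d' % value)
    return encoded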