4988.10.3
by John Arbash Meinel
Merge bzr.dev 5007, resolve conflict, update NEWS |
1 |
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
|
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
2 |
#
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
7 |
#
|
|
8 |
# This program is distributed in the hope that it will be useful,
|
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
12 |
#
|
|
13 |
# You should have received a copy of the GNU General Public License
|
|
14 |
# along with this program; if not, write to the Free Software
|
|
15 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
16 |
||
17 |
"""Persistent maps from tuple_of_strings->string using CHK stores.
|
|
18 |
||
19 |
Overview and current status:
|
|
20 |
||
21 |
The CHKMap class implements a dict from tuple_of_strings->string by using a trie
|
|
22 |
with internal nodes of 8-bit fan out; the key tuples are mapped to strings by
|
|
23 |
joining them by \x00, and \x00 padding shorter keys out to the length of the
|
|
24 |
longest key. Leaf nodes are packed as densely as possible, and internal nodes
|
|
3735.19.1
by Ian Clatworthy
CHKMap cleanups |
25 |
are all an additional 8-bits wide leading to a sparse upper tree.
|
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
26 |
|
27 |
Updates to a CHKMap are done preferentially via the apply_delta method, to
|
|
28 |
allow optimisation of the update operation; but individual map/unmap calls are
|
|
4526.9.5
by Robert Collins
Require that added ids in inventory deltas be new. |
29 |
possible and supported. Individual changes via map/unmap are buffered in memory
|
30 |
until the _save method is called to force serialisation of the tree.
|
|
31 |
apply_delta records its changes immediately by performing an implicit _save.
|
|
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
32 |
|
33 |
TODO:
|
|
34 |
-----
|
|
35 |
||
36 |
Densely packed upper nodes.
|
|
37 |
||
38 |
"""
|
|
39 |
||
40 |
import heapq |
|
4797.7.1
by Robert Collins
Introduce a threading.local to isolate the chk_map page cache from other threads. |
41 |
import threading |
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
42 |
|
43 |
from bzrlib import lazy_import |
|
44 |
lazy_import.lazy_import(globals(), """ |
|
4526.9.5
by Robert Collins
Require that added ids in inventory deltas be new. |
45 |
from bzrlib import (
|
46 |
errors,
|
|
47 |
versionedfile,
|
|
48 |
)
|
|
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
49 |
""") |
50 |
from bzrlib import ( |
|
51 |
lru_cache, |
|
52 |
osutils, |
|
53 |
registry, |
|
4668.3.2
by John Arbash Meinel
Don't forget to import the library... |
54 |
static_tuple, |
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
55 |
trace, |
56 |
)
|
|
4679.9.4
by John Arbash Meinel
A bit broken, but getting there. |
57 |
from bzrlib.static_tuple import StaticTuple |
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
58 |
|
3735.19.1
by Ian Clatworthy
CHKMap cleanups |
59 |
# Size handed to the page cache when it is created: approx 4MB.
# If each line is 50 bytes, and you have 255 internal pages, with 255-way fan
# out, it takes 3.1MB to cache the layer.
_PAGE_CACHE_SIZE = 4*1024*1024
# Per thread caches for 2 reasons:
#   - in the server we may be serving very different content, so we get less
#     cache thrashing.
#   - we avoid locking on every cache lookup.
_thread_caches = threading.local()
# The page cache. It starts out as None here and is created on first use by
# _get_cache(), which also copes with threads where this attribute is missing.
_thread_caches.page_cache = None
|
70 |
||
71 |
def _get_cache():
    """Return the calling thread's page cache, creating it on first use.

    The lookup goes through getattr() with a default because, in a new
    thread, the _thread_caches threading.local object does not have the
    'page_cache' attribute initialized yet.

    :return: The cache object stored on _thread_caches.page_cache.
    """
    cache = getattr(_thread_caches, 'page_cache', None)
    if cache is not None:
        return cache
    # We are caching bytes so len(value) is perfectly accurate
    cache = lru_cache.LRUSizeCache(_PAGE_CACHE_SIZE)
    _thread_caches.page_cache = cache
    return cache
|
83 |
||
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
84 |
|
4543.2.2
by John Arbash Meinel
work out some tests that expose that bundles don't work w/ 2a formats. |
85 |
def clear_cache():
    """Clear the page cache returned by _get_cache()."""
    _get_cache().clear()
87 |
||
4543.2.2
by John Arbash Meinel
work out some tests that expose that bundles don't work w/ 2a formats. |
88 |
|
3735.2.123
by Ian Clatworthy
only check for remap if changes are interesting in size |
89 |
# Thresholds that decide when a remap check is worth doing.
# NOTE(review): neither constant is referenced in this part of the file;
# presumably they are read by the node unmap code further down - confirm.
# If a ChildNode falls below this many bytes, we check for a remap
_INTERESTING_NEW_SIZE = 50
# If a ChildNode shrinks by more than this amount, we check for a remap
_INTERESTING_SHRINKAGE_LIMIT = 20
|
93 |
||
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
94 |
|
95 |
def _search_key_plain(key):
    """Build the search string for a key tuple from the raw key bytes.

    :param key: A sequence of strings.
    :return: The elements of key joined by a single NUL byte.
    """
    separator = '\x00'
    return separator.join(key)
|
98 |
||
99 |
||
100 |
# Registry of search-key functions, keyed by name. The plain implementation
# above is registered under 'plain'.
search_key_registry = registry.Registry()
search_key_registry.register('plain', _search_key_plain)
|
102 |
||
103 |
||
104 |
class CHKMap(object):
    """A persistent map from tuple_of_strings to string backed by a CHK store.

    Instances use __slots__, so they carry exactly these attributes:

    :ivar _store: The store given to the constructor; node bytes are read
        from it on demand.
    :ivar _root_node: Either the key (a StaticTuple) of a root node that has
        not been loaded yet, or the root node object itself.
    :ivar _search_key_func: The function mapping a key to its search string.
    """

    __slots__ = ('_store', '_root_node', '_search_key_func')
108 |
||
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
109 |
def __init__(self, store, root_key, search_key_func=None):
    """Create a CHKMap object.

    :param store: The store the CHKMap is stored in.
    :param root_key: The root key of the map. None to create an empty
        CHKMap.
    :param search_key_func: A function mapping a key => bytes. These bytes
        are then used by the internal nodes to split up leaf nodes into
        multiple pages. Defaults to _search_key_plain when None.
    """
    if search_key_func is None:
        search_key_func = _search_key_plain
    self._store = store
    self._search_key_func = search_key_func
    if root_key is not None:
        # Keep only the key for now; the node itself is loaded on demand.
        self._root_node = self._node_key(root_key)
    else:
        self._root_node = LeafNode(search_key_func=search_key_func)
|
127 |
||
128 |
def apply_delta(self, delta):
    """Apply a delta to the map.

    :param delta: An iterable of old_key, new_key, new_value tuples.
        If new_key is not None, then new_key->new_value is inserted
        into the map; if old_key is not None, then the old mapping
        of old_key is removed. The iterable is consumed exactly once, so
        one-shot iterators such as generators are accepted.
    :return: The key of the root node, as returned by _save().
    :raises errors.InconsistentDeltaDelta: If a key that the delta adds
        (old_key is None and new_key is not None) is already in the map.
    """
    # The delta is walked three times below (precondition check, removals,
    # insertions). Materialise it first so that a one-shot iterator is not
    # silently exhausted by the first pass, which would skip every change.
    delta = list(delta)
    has_deletes = False
    # Check preconditions first.
    as_st = StaticTuple.from_sequence
    new_items = set([as_st(key) for (old, key, value) in delta
                     if key is not None and old is None])
    existing_new = list(self.iteritems(key_filter=new_items))
    if existing_new:
        raise errors.InconsistentDeltaDelta(delta,
            "New items are already in the map %r." % existing_new)
    # Now apply changes: every removal first, then every insertion.
    for old, new, value in delta:
        if old is not None and old != new:
            # The remap check is deferred and done once after all changes.
            self.unmap(old, check_remap=False)
            has_deletes = True
    for old, new, value in delta:
        if new is not None:
            self.map(new, value)
    if has_deletes:
        self._check_remap()
    return self._save()
156 |
||
157 |
def _ensure_root(self):
    """Ensure that the root node is an object not a key."""
    root = self._root_node
    if type(root) is not StaticTuple:
        # Already a node object, nothing to load.
        return
    # Demand-load the root
    self._root_node = self._get_node(root)
|
162 |
||
163 |
def _get_node(self, node):
    """Get a node.

    Note that this does not update the _items dict in objects containing a
    reference to this node. As such it does not prevent subsequent IO being
    performed.

    :param node: A tuple key or node object.
    :return: A node object.
    """
    if type(node) is not StaticTuple:
        # Not a key, so it is handed back unchanged.
        return node
    raw = self._read_bytes(node)
    return _deserialise(raw, node, search_key_func=self._search_key_func)
|
179 |
||
180 |
def _read_bytes(self, key):
    """Return the bytes stored under key, going through the page cache.

    :param key: The key of the record to read.
    :return: The cached bytes when present; otherwise the 'fulltext' of the
        record fetched from self._store, which is also added to the cache.
    """
    try:
        cached = _get_cache()[key]
    except KeyError:
        stream = self._store.get_record_stream([key], 'unordered', True)
        record = stream.next()
        fulltext = record.get_bytes_as('fulltext')
        _get_cache()[key] = fulltext
        return fulltext
    else:
        return cached
188 |
||
189 |
def _dump_tree(self, include_keys=False):
    """Return the tree in a string representation.

    :param include_keys: Passed through to _dump_tree_node().
    :return: The lines produced by _dump_tree_node() for the root node,
        joined with newlines and ending with one.
    """
    self._ensure_root()
    lines = self._dump_tree_node(
        self._root_node, prefix='', indent='', include_keys=include_keys)
    # An empty final element gives the joined text a trailing '\n'
    lines.append('')
    return '\n'.join(lines)
|
196 |
||
197 |
def _dump_tree_node(self, node, prefix, indent, include_keys=True):
    """For this node and all children, generate a string representation.

    :param node: The node object to describe.
    :param prefix: The prefix under which node is referenced; shown via %r.
    :param indent: The string put in front of this node's header line.
    :param include_keys: When true, the first element of node.key() (or
        'None') is appended to the header line.
    :return: A list of strings, one per output line.
    """
    if include_keys:
        node_key = node.key()
        if node_key is None:
            key_str = ' None'
        else:
            key_str = ' %s' % (node_key[0],)
    else:
        key_str = ''
    header = '%s%r %s%s' % (indent, prefix, node.__class__.__name__,
                            key_str)
    lines = [header]
    if type(node) is not InternalNode:
        for key, value in sorted(node._items.iteritems()):
            # Don't use prefix nor indent here to line up when used in
            # tests in conjunction with assertEqualDiff
            lines.append(' %r %r' % (tuple(key), value))
        return lines
    # Trigger all child nodes to get loaded
    list(node._iter_nodes(self._store))
    for child_prefix, child in sorted(node._items.iteritems()):
        lines.extend(self._dump_tree_node(child, child_prefix, indent + ' ',
                                          include_keys=include_keys))
    return lines
222 |
||
223 |
@classmethod
def from_dict(klass, store, initial_value, maximum_size=0, key_width=1,
              search_key_func=None):
    """Create a CHKMap in store with initial_value as the content.

    :param store: The store to record initial_value in, a VersionedFiles
        object with 1-tuple keys supporting CHK key generation.
    :param initial_value: A dict to store in store. Its keys and values
        must be bytestrings.
    :param maximum_size: The maximum_size rule to apply to nodes. This
        determines the size at which no new data is added to a single node.
    :param key_width: The number of elements in each key_tuple being stored
        in this map.
    :param search_key_func: A function mapping a key => bytes. These bytes
        are then used by the internal nodes to split up leaf nodes into
        multiple pages.
    :return: The root chk of the resulting CHKMap.
    :raises AssertionError: If the created root key is not a StaticTuple.
    """
    root_key = klass._create_directly(
        store, initial_value, maximum_size=maximum_size,
        key_width=key_width, search_key_func=search_key_func)
    if type(root_key) is StaticTuple:
        return root_key
    raise AssertionError('we got a %s instead of a StaticTuple'
                         % (type(root_key),))
248 |
||
249 |
@classmethod
def _create_via_map(klass, store, initial_value, maximum_size=0,
                    key_width=1, search_key_func=None):
    """Create a map by applying initial_value to an empty map as a delta.

    Takes the same parameters as from_dict().

    :return: The root key returned by apply_delta().
    """
    chk_map = klass(store, None, search_key_func=search_key_func)
    root = chk_map._root_node
    root.set_maximum_size(maximum_size)
    root._key_width = key_width
    # Every entry is an addition: there is no old key.
    delta = [(None, key, value) for key, value in initial_value.items()]
    return chk_map.apply_delta(delta)
260 |
||
261 |
@classmethod
def _create_directly(klass, store, initial_value, maximum_size=0,
                     key_width=1, search_key_func=None):
    """Create a map by filling one LeafNode and splitting it if needed.

    Takes the same parameters as from_dict().

    :return: The last key produced by serialising the resulting root node.
    :raises AssertionError: If a split was needed but produced one node.
    """
    leaf = LeafNode(search_key_func=search_key_func)
    leaf.set_maximum_size(maximum_size)
    leaf._key_width = key_width
    as_st = StaticTuple.from_sequence
    items = {}
    for key, val in initial_value.iteritems():
        items[as_st(key)] = val
    leaf._items = items
    raw_size = 0
    for key, value in items.iteritems():
        raw_size += leaf._key_value_len(key, value)
    leaf._raw_size = raw_size
    leaf._len = len(items)
    leaf._compute_search_prefix()
    leaf._compute_serialised_prefix()
    root = leaf
    # A maximum_size of 0 never triggers a split.
    needs_split = (leaf._len > 1
                   and maximum_size
                   and leaf._current_size() > maximum_size)
    if needs_split:
        prefix, node_details = leaf._split(store)
        if len(node_details) == 1:
            raise AssertionError('Failed to split using node._split')
        root = InternalNode(prefix, search_key_func=search_key_func)
        root.set_maximum_size(maximum_size)
        root._key_width = key_width
        for split, subnode in node_details:
            root.add_node(split, subnode)
    keys = list(root.serialise(store))
    return keys[-1]
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
288 |
|
289 |
def iter_changes(self, basis):
    """Iterate over the changes between basis and self.

    This is a generator. Nothing is yielded when both maps have the same
    root key.

    :param basis: The CHKMap to compare against; its values are reported as
        old_value.
    :return: An iterator of tuples: (key, old_value, new_value). Old_value
        is None for keys only in self; new_value is None for keys only in
        basis.
    :raises AssertionError: If two entries popped as a common prefix turn
        out to have different prefixes.
    """
    # Overview:
    # Read both trees in lexicographic, highest-first order.
    # Any identical nodes we skip
    # Any unique prefixes we output immediately.
    # values in a leaf node are treated as single-value nodes in the tree
    # which allows them to be not-special-cased. We know to output them
    # because their value is a string, not a key(tuple) or node.
    #
    # corner cases to beware of when considering this function:
    # *) common references are at different heights.
    #    consider two trees:
    #    {'a': LeafNode={'aaa':'foo', 'aab':'bar'}, 'b': LeafNode={'b'}}
    #    {'a': InternalNode={'aa':LeafNode={'aaa':'foo', 'aab':'bar'},
    #                        'ab':LeafNode={'ab':'bar'}}
    #     'b': LeafNode={'b'}}
    #    the node with aaa/aab will only be encountered in the second tree
    #    after reading the 'a' subtree, but it is encountered in the first
    #    tree immediately. Variations on this may have read internal nodes
    #    like this. we want to cut the entire pending subtree when we
    #    realise we have a common node. For this we use a list of keys -
    #    the path to a node - and check the entire path is clean as we
    #    process each item.
    if self._node_key(self._root_node) == self._node_key(basis._root_node):
        return
    self._ensure_root()
    basis._ensure_root()
    excluded_keys = set()
    self_node = self._root_node
    basis_node = basis._root_node
    # A heap, each element is prefix, node(tuple/NodeObject/string),
    # key_path (a list of tuples, tail-sharing down the tree.)
    # The entries actually pushed below are 4-tuples:
    # (search prefix, key or None, value-or-child, path). The key is None
    # for children of internal nodes and set for leaf values.
    self_pending = []
    basis_pending = []
    def process_node(node, path, a_map, pending):
        # take a node and expand it
        node = a_map._get_node(node)
        if type(node) == LeafNode:
            path = (node._key, path)
            for key, value in node._items.items():
                # For a LeafNode, the key is a serialized_key, rather than
                # a search_key, but the heap is using search_keys
                search_key = node._search_key_func(key)
                heapq.heappush(pending, (search_key, key, value, path))
        else:
            # type(node) == InternalNode
            path = (node._key, path)
            for prefix, child in node._items.items():
                heapq.heappush(pending, (prefix, None, child, path))
    def process_common_internal_nodes(self_node, basis_node):
        # Only the children that differ between the two nodes are queued.
        self_items = set(self_node._items.items())
        basis_items = set(basis_node._items.items())
        path = (self_node._key, None)
        for prefix, child in self_items - basis_items:
            heapq.heappush(self_pending, (prefix, None, child, path))
        path = (basis_node._key, None)
        for prefix, child in basis_items - self_items:
            heapq.heappush(basis_pending, (prefix, None, child, path))
    def process_common_leaf_nodes(self_node, basis_node):
        # Only the key/value pairs that differ between the leaves are queued.
        self_items = set(self_node._items.items())
        basis_items = set(basis_node._items.items())
        path = (self_node._key, None)
        for key, value in self_items - basis_items:
            prefix = self._search_key_func(key)
            heapq.heappush(self_pending, (prefix, key, value, path))
        path = (basis_node._key, None)
        for key, value in basis_items - self_items:
            prefix = basis._search_key_func(key)
            heapq.heappush(basis_pending, (prefix, key, value, path))
    def process_common_prefix_nodes(self_node, self_path,
                                    basis_node, basis_path):
        # Would it be more efficient if we could request both at the same
        # time?
        self_node = self._get_node(self_node)
        basis_node = basis._get_node(basis_node)
        if (type(self_node) == InternalNode
            and type(basis_node) == InternalNode):
            # Matching internal nodes
            process_common_internal_nodes(self_node, basis_node)
        elif (type(self_node) == LeafNode
              and type(basis_node) == LeafNode):
            process_common_leaf_nodes(self_node, basis_node)
        else:
            # Mixed node types: expand each side in full.
            process_node(self_node, self_path, self, self_pending)
            process_node(basis_node, basis_path, basis, basis_pending)
    process_common_prefix_nodes(self_node, None, basis_node, None)
    # NOTE(review): self_seen and basis_seen are never read in this function.
    self_seen = set()
    basis_seen = set()
    # NOTE(review): nothing in this function adds to excluded_keys, so as
    # written check_excluded() always returns False.
    excluded_keys = set()
    def check_excluded(key_path):
        # Note that this is N^2, it depends on us trimming trees
        # aggressively to not become slow.
        # A better implementation would probably have a reverse map
        # back to the children of a node, and jump straight to it when
        # a common node is detected, then proceed to remove the already
        # pending children. bzrlib.graph has a searcher module with a
        # similar problem.
        while key_path is not None:
            key, key_path = key_path
            if key in excluded_keys:
                return True
        return False

    # loop_counter is only incremented; see the commented-out print at the
    # end of the function.
    loop_counter = 0
    while self_pending or basis_pending:
        loop_counter += 1
        if not self_pending:
            # self is exhausted: output remainder of basis
            for prefix, key, node, path in basis_pending:
                if check_excluded(path):
                    continue
                node = basis._get_node(node)
                if key is not None:
                    # a value
                    yield (key, node, None)
                else:
                    # subtree - fastpath the entire thing.
                    for key, value in node.iteritems(basis._store):
                        yield (key, value, None)
            return
        elif not basis_pending:
            # basis is exhausted: output remainder of self.
            for prefix, key, node, path in self_pending:
                if check_excluded(path):
                    continue
                node = self._get_node(node)
                if key is not None:
                    # a value
                    yield (key, None, node)
                else:
                    # subtree - fastpath the entire thing.
                    for key, value in node.iteritems(self._store):
                        yield (key, None, value)
            return
        else:
            # XXX: future optimisation - yield the smaller items
            # immediately rather than pushing everything on/off the
            # heaps. Applies to both internal nodes and leafnodes.
            if self_pending[0][0] < basis_pending[0][0]:
                # expand self
                prefix, key, node, path = heapq.heappop(self_pending)
                if check_excluded(path):
                    continue
                if key is not None:
                    # a value
                    yield (key, None, node)
                else:
                    process_node(node, path, self, self_pending)
                    continue
            elif self_pending[0][0] > basis_pending[0][0]:
                # expand basis
                prefix, key, node, path = heapq.heappop(basis_pending)
                if check_excluded(path):
                    continue
                if key is not None:
                    # a value
                    yield (key, node, None)
                else:
                    process_node(node, path, basis, basis_pending)
                    continue
            else:
                # common prefix: possibly expand both
                if self_pending[0][1] is None:
                    # process next self
                    read_self = True
                else:
                    read_self = False
                if basis_pending[0][1] is None:
                    # process next basis
                    read_basis = True
                else:
                    read_basis = False
                if not read_self and not read_basis:
                    # compare a common value
                    self_details = heapq.heappop(self_pending)
                    basis_details = heapq.heappop(basis_pending)
                    if self_details[2] != basis_details[2]:
                        yield (self_details[1],
                            basis_details[2], self_details[2])
                    continue
                # At least one side wasn't a simple value
                if (self._node_key(self_pending[0][2]) ==
                    self._node_key(basis_pending[0][2])):
                    # Identical pointers, skip (and don't bother adding to
                    # excluded, it won't turn up again.
                    heapq.heappop(self_pending)
                    heapq.heappop(basis_pending)
                    continue
                # Now we need to expand this node before we can continue
                if read_self and read_basis:
                    # Both sides start with the same prefix, so process
                    # them in parallel
                    self_prefix, _, self_node, self_path = heapq.heappop(
                        self_pending)
                    basis_prefix, _, basis_node, basis_path = heapq.heappop(
                        basis_pending)
                    if self_prefix != basis_prefix:
                        raise AssertionError(
                            '%r != %r' % (self_prefix, basis_prefix))
                    process_common_prefix_nodes(
                        self_node, self_path,
                        basis_node, basis_path)
                    continue
                if read_self:
                    prefix, key, node, path = heapq.heappop(self_pending)
                    if check_excluded(path):
                        continue
                    process_node(node, path, self, self_pending)
                if read_basis:
                    prefix, key, node, path = heapq.heappop(basis_pending)
                    if check_excluded(path):
                        continue
                    process_node(node, path, basis, basis_pending)
    # print loop_counter
|
|
509 |
||
510 |
def iteritems(self, key_filter=None):
    """Iterate over the entire CHKMap's contents.

    :param key_filter: An optional iterable of keys. When given, each key
        is converted with StaticTuple.from_sequence and the resulting list
        is passed to the root node as its key_filter.
    :return: Whatever the root node's iteritems() returns.
    """
    self._ensure_root()
    if key_filter is None:
        wanted = None
    else:
        as_st = StaticTuple.from_sequence
        wanted = [as_st(key) for key in key_filter]
    return self._root_node.iteritems(self._store, key_filter=wanted)
517 |
||
518 |
def key(self):
    """Return the key for this map."""
    root = self._root_node
    if type(root) is not StaticTuple:
        # A loaded node object: report the key it carries.
        return root._key
    return root
|
524 |
||
525 |
def __len__(self):
    """Return len() of the root node, loading the root first if needed."""
    self._ensure_root()
    return len(self._root_node)
|
528 |
||
529 |
def map(self, key, value):
    """Map a key tuple to value.

    :param key: A key to map.
    :param value: The value to assign to key.
    """
    key = StaticTuple.from_sequence(key)
    # Need a root object.
    self._ensure_root()
    prefix, node_details = self._root_node.map(self._store, key, value)
    if len(node_details) == 1:
        # A single node came back, so it simply becomes the root.
        self._root_node = node_details[0][1]
        return
    # Otherwise a new InternalNode root is built over the returned nodes,
    # taking its size limit and key width from the first of them.
    new_root = InternalNode(prefix, search_key_func=self._search_key_func)
    self._root_node = new_root
    first_node = node_details[0][1]
    new_root.set_maximum_size(first_node.maximum_size)
    new_root._key_width = first_node._key_width
    for split, node in node_details:
        new_root.add_node(split, node)
|
548 |
||
549 |
def _node_key(self, node):
    """Get the key for a node whether it's a tuple or node.

    :param node: A plain tuple, a StaticTuple, or a node object.
    :return: The StaticTuple key; plain tuples are converted first, and
        node objects report their _key attribute.
    """
    kind = type(node)
    if kind is tuple:
        node = StaticTuple.from_sequence(node)
        kind = type(node)
    if kind is StaticTuple:
        return node
    return node._key
|
557 |
||
3735.2.122
by Ian Clatworthy
don't check_remap on every unmap call in CHKMap.apply_delta() |
558 |
def unmap(self, key, check_remap=True):
    """remove key from the map.

    :param key: The key to remove.
    :param check_remap: Forwarded to the root node's unmap() only when the
        root is an InternalNode.
    """
    key = StaticTuple.from_sequence(key)
    self._ensure_root()
    root = self._root_node
    if type(root) is not InternalNode:
        self._root_node = root.unmap(self._store, key)
    else:
        self._root_node = root.unmap(self._store, key,
                                     check_remap=check_remap)
568 |
||
3735.2.122
by Ian Clatworthy
don't check_remap on every unmap call in CHKMap.apply_delta() |
569 |
def _check_remap(self):
    """Check if nodes can be collapsed."""
    self._ensure_root()
    root = self._root_node
    if type(root) is not InternalNode:
        # Only an InternalNode root is checked.
        return
    self._root_node = root._check_remap(self._store)
3735.2.122
by Ian Clatworthy
don't check_remap on every unmap call in CHKMap.apply_delta() |
574 |
|
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
575 |
def _save(self):
    """Save the map completely.

    :return: The key of the root node.
    """
    root = self._root_node
    if type(root) is StaticTuple:
        # Already saved.
        return root
    # The last key produced by serialise() is the one returned.
    written = list(root.serialise(self._store))
    return written[-1]
|
585 |
||
586 |
||
587 |
class Node(object):
    """Base class defining the protocol for CHK Map nodes.

    :ivar _key: The key of this node; None on a freshly created node.
    :ivar _len: Current number of elements.
    :ivar _maximum_size: The size threshold set via set_maximum_size().
    :ivar _key_width: The width of keys for this node.
    :ivar _raw_size: The total size of the serialized key:value data, before
        adding the header bytes, and without prefix compression.
    :ivar _items: The pointers/values this node has - meaning defined by
        child classes.
    :ivar _search_prefix: The common search prefix.
    :ivar _search_key_func: Declared in __slots__ but not assigned by
        Node.__init__.
    """

    __slots__ = ('_key', '_len', '_maximum_size', '_key_width',
                 '_raw_size', '_items', '_search_prefix', '_search_key_func'
                )
|
|
597 |
||
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
598 |
def __init__(self, key_width=1): |
599 |
"""Create a node.
|
|
600 |
||
601 |
:param key_width: The width of keys for this node.
|
|
602 |
"""
|
|
603 |
self._key = None |
|
604 |
# Current number of elements
|
|
605 |
self._len = 0 |
|
606 |
self._maximum_size = 0 |
|
3735.19.1
by Ian Clatworthy
CHKMap cleanups |
607 |
self._key_width = key_width |
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
608 |
# current size in bytes
|
609 |
self._raw_size = 0 |
|
610 |
# The pointers/values this node has - meaning defined by child classes.
|
|
611 |
self._items = {} |
|
612 |
# The common search prefix
|
|
613 |
self._search_prefix = None |
|
614 |
||
615 |
def __repr__(self): |
|
616 |
items_str = str(sorted(self._items)) |
|
617 |
if len(items_str) > 20: |
|
3735.2.154
by Ian Clatworthy
fix chk_map Node %r formatting |
618 |
items_str = items_str[:16] + '...]' |
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
619 |
return '%s(key:%s len:%s size:%s max:%s prefix:%s items:%s)' % ( |
620 |
self.__class__.__name__, self._key, self._len, self._raw_size, |
|
621 |
self._maximum_size, self._search_prefix, items_str) |
|
622 |
||
623 |
def key(self): |
|
624 |
return self._key |
|
625 |
||
626 |
def __len__(self): |
|
627 |
return self._len |
|
628 |
||
629 |
@property
|
|
630 |
def maximum_size(self): |
|
631 |
"""What is the upper limit for adding references to a node."""
|
|
632 |
return self._maximum_size |
|
633 |
||
634 |
def set_maximum_size(self, new_size): |
|
635 |
"""Set the size threshold for nodes.
|
|
636 |
||
637 |
:param new_size: The size at which no data is added to a node. 0 for
|
|
638 |
unlimited.
|
|
639 |
"""
|
|
640 |
self._maximum_size = new_size |
|
641 |
||
642 |
@classmethod
|
|
643 |
def common_prefix(cls, prefix, key): |
|
644 |
"""Given 2 strings, return the longest prefix common to both.
|
|
645 |
||
646 |
:param prefix: This has been the common prefix for other keys, so it is
|
|
647 |
more likely to be the common prefix in this case as well.
|
|
648 |
:param key: Another string to compare to
|
|
649 |
"""
|
|
650 |
if key.startswith(prefix): |
|
651 |
return prefix |
|
4358.1.1
by Jelmer Vernooij
Support empty keys when looking for common prefixes in CHKMap. |
652 |
pos = -1 |
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
653 |
# Is there a better way to do this?
|
654 |
for pos, (left, right) in enumerate(zip(prefix, key)): |
|
655 |
if left != right: |
|
656 |
pos -= 1 |
|
657 |
break
|
|
658 |
common = prefix[:pos+1] |
|
659 |
return common |
|
660 |
||
661 |
@classmethod
|
|
662 |
def common_prefix_for_keys(cls, keys): |
|
663 |
"""Given a list of keys, find their common prefix.
|
|
664 |
||
665 |
:param keys: An iterable of strings.
|
|
666 |
:return: The longest common prefix of all keys.
|
|
667 |
"""
|
|
668 |
common_prefix = None |
|
669 |
for key in keys: |
|
670 |
if common_prefix is None: |
|
671 |
common_prefix = key |
|
672 |
continue
|
|
673 |
common_prefix = cls.common_prefix(common_prefix, key) |
|
674 |
if not common_prefix: |
|
675 |
# if common_prefix is the empty string, then we know it won't
|
|
676 |
# change further
|
|
677 |
return '' |
|
678 |
return common_prefix |
|
679 |
||
680 |
||
681 |
# Singleton indicating we have not computed _search_prefix yet.  It is
# compared by identity (``is _unknown``) so it can be told apart from None,
# which is itself a value _search_prefix may hold.
_unknown = object()
|
683 |
||
684 |
class LeafNode(Node):
    """A node containing actual key:value pairs.

    :ivar _items: A dict of key->value items. The key is in tuple form.
    :ivar _raw_size: The number of bytes that would be used by serializing
        all of the key/value pairs (without prefix compression).
    :ivar _common_serialised_prefix: The prefix shared by the serialised
        form of every key in this node, or None when the node is empty.
    """

    __slots__ = ('_common_serialised_prefix',)

    def __init__(self, search_key_func=None):
        """Create an empty leaf.

        :param search_key_func: Function mapping a key tuple to its search
            key.  When None, _search_key_plain is used.
        """
        Node.__init__(self)
        # All of the keys in this leaf node share this common prefix
        self._common_serialised_prefix = None
        if search_key_func is None:
            self._search_key_func = _search_key_plain
        else:
            self._search_key_func = search_key_func

    def __repr__(self):
        items_str = str(sorted(self._items))
        if len(items_str) > 20:
            items_str = items_str[:16] + '...]'
        return \
            '%s(key:%s len:%s size:%s max:%s prefix:%s keywidth:%s items:%s)' \
            % (self.__class__.__name__, self._key, self._len, self._raw_size,
            self._maximum_size, self._search_prefix, self._key_width, items_str)

    def _current_size(self):
        """Answer the current serialised size of this node.

        This differs from self._raw_size in that it includes the bytes used for
        the header.
        """
        if self._common_serialised_prefix is None:
            bytes_for_items = 0
            prefix_len = 0
        else:
            # We will store a single string with the common prefix
            # And then that common prefix will not be stored in any of the
            # entry lines
            prefix_len = len(self._common_serialised_prefix)
            bytes_for_items = (self._raw_size - (prefix_len * self._len))
        return (9 # 'chkleaf:\n'
            + len(str(self._maximum_size)) + 1
            + len(str(self._key_width)) + 1
            + len(str(self._len)) + 1
            + prefix_len + 1
            + bytes_for_items)

    @classmethod
    def deserialise(klass, bytes, key, search_key_func=None):
        """Deserialise bytes, with key key, into a LeafNode.

        :param bytes: The bytes of the node.
        :param key: The key that the serialised node has.
        :param search_key_func: Passed through to _deserialise_leaf_node.
        """
        key = static_tuple.expect_static_tuple(key)
        return _deserialise_leaf_node(bytes, key,
                                      search_key_func=search_key_func)

    def iteritems(self, store, key_filter=None):
        """Iterate over items in the node.

        :param store: Unused here; present so that leaf and internal nodes
            share one signature.
        :param key_filter: A filter to apply to the node. It should be a
            list/set/dict or similar repeatedly iterable container.
        :return: A generator of (key, value) pairs.
        """
        if key_filter is not None:
            # Adjust the filter - short elements go to a prefix filter. All
            # other items are looked up directly.
            # XXX: perhaps defaultdict? Profiling<rinse and repeat>
            filters = {}
            for key in key_filter:
                if len(key) == self._key_width:
                    # This filter is meant to match exactly one key, yield it
                    # if we have it.
                    try:
                        yield key, self._items[key]
                    except KeyError:
                        # This key is not present in this map, continue
                        pass
                else:
                    # Short items, we need to match based on a prefix
                    length_filter = filters.setdefault(len(key), set())
                    length_filter.add(key)
            if filters:
                filters = filters.items()
                for item in self._items.iteritems():
                    for length, length_filter in filters:
                        if item[0][:length] in length_filter:
                            yield item
                            # One match is enough; never yield an item twice.
                            break
        else:
            for item in self._items.iteritems():
                yield item

    def _key_value_len(self, key, value):
        """Return the number of bytes serialise() emits for one item.

        An item is written as ``<key>\\x00<line count>\\n<value>\\n`` where the
        line count is the number of lines in ``value + '\\n'``, i.e.
        ``value.count('\\n') + 1``.  The digit count must therefore be taken
        from that number and not from the bare newline count, otherwise the
        bookkeeping is one byte short whenever the value holds 9, 99, ...
        newlines and serialise() fails its size check.
        """
        # TODO: Should probably be done without actually joining the key, but
        #       then that can be done via the C extension
        return (len(self._serialise_key(key)) + 1
                + len(str(value.count('\n') + 1)) + 1
                + len(value) + 1)

    def _search_key(self, key):
        """Return the search key for key, as given by _search_key_func."""
        return self._search_key_func(key)

    def _map_no_split(self, key, value):
        """Map a key to a value.

        This assumes either the key does not already exist, or you have already
        removed its size and length from self.

        :return: True if adding this node should cause us to split.
        """
        self._items[key] = value
        self._raw_size += self._key_value_len(key, value)
        self._len += 1
        serialised_key = self._serialise_key(key)
        if self._common_serialised_prefix is None:
            self._common_serialised_prefix = serialised_key
        else:
            self._common_serialised_prefix = self.common_prefix(
                self._common_serialised_prefix, serialised_key)
        search_key = self._search_key(key)
        if self._search_prefix is _unknown:
            # The new key is already in _items, so this covers it too.
            self._compute_search_prefix()
        if self._search_prefix is None:
            self._search_prefix = search_key
        else:
            self._search_prefix = self.common_prefix(
                self._search_prefix, search_key)
        if (self._len > 1
            and self._maximum_size
            and self._current_size() > self._maximum_size):
            # Check to see if all of the search_keys for this node are
            # identical. We allow the node to grow under that circumstance
            # (we could track this as common state, but it is infrequent)
            if (search_key != self._search_prefix
                or not self._are_search_keys_identical()):
                return True
        return False

    def _split(self, store):
        """We have overflowed.

        Split this node into multiple LeafNodes, return it up the stack so that
        the next layer creates a new InternalNode and references the new nodes.

        :return: (common_serialised_prefix, [(node_serialised_prefix, node)])
        """
        if self._search_prefix is _unknown:
            raise AssertionError('Search prefix must be known')
        common_prefix = self._search_prefix
        split_at = len(common_prefix) + 1
        result = {}
        for key, value in self._items.iteritems():
            search_key = self._search_key(key)
            prefix = search_key[:split_at]
            # TODO: Generally only 1 key can be exactly the right length,
            #       which means we can only have 1 key in the node pointed
            #       at by the 'prefix\0' key. We might want to consider
            #       folding it into the containing InternalNode rather than
            #       having a fixed length-1 node.
            #       Note this is probably not true for hash keys, as they
            #       may get a '\00' node anywhere, but won't have keys of
            #       different lengths.
            if len(prefix) < split_at:
                prefix += '\x00'*(split_at - len(prefix))
            if prefix not in result:
                node = LeafNode(search_key_func=self._search_key_func)
                node.set_maximum_size(self._maximum_size)
                node._key_width = self._key_width
                result[prefix] = node
            else:
                node = result[prefix]
            sub_prefix, node_details = node.map(store, key, value)
            if len(node_details) > 1:
                if prefix != sub_prefix:
                    # This node has been split and is now found via a different
                    # path
                    result.pop(prefix)
                new_node = InternalNode(sub_prefix,
                    search_key_func=self._search_key_func)
                new_node.set_maximum_size(self._maximum_size)
                new_node._key_width = self._key_width
                for split, node in node_details:
                    new_node.add_node(split, node)
                result[prefix] = new_node
        return common_prefix, result.items()

    def map(self, store, key, value):
        """Map key to value.

        :return: A (search_prefix, [(prefix, node)]) tuple.  When the node did
            not need to split the list is ``[("", self)]``; otherwise it is
            the result of _split().
        """
        if key in self._items:
            # Replacing a value: take the old entry out of the accounting
            # first, as _map_no_split expects.
            self._raw_size -= self._key_value_len(key, self._items[key])
            self._len -= 1
        # The content is changing, so any recorded key no longer applies.
        self._key = None
        if self._map_no_split(key, value):
            return self._split(store)
        else:
            if self._search_prefix is _unknown:
                raise AssertionError('%r must be known' % self._search_prefix)
            return self._search_prefix, [("", self)]

    # Joins the elements of a key tuple with NUL to give its serialised form.
    _serialise_key = '\x00'.join

    def serialise(self, store):
        """Serialise the LeafNode to store.

        :param store: A VersionedFiles honouring the CHK extensions.
        :return: An iterable of the keys inserted by this operation.
        """
        lines = ["chkleaf:\n"]
        lines.append("%d\n" % self._maximum_size)
        lines.append("%d\n" % self._key_width)
        lines.append("%d\n" % self._len)
        if self._common_serialised_prefix is None:
            lines.append('\n')
            if len(self._items) != 0:
                raise AssertionError('If _common_serialised_prefix is None'
                    ' we should have no items')
        else:
            lines.append('%s\n' % (self._common_serialised_prefix,))
            prefix_len = len(self._common_serialised_prefix)
        for key, value in sorted(self._items.items()):
            # Always add a final newline
            value_lines = osutils.chunks_to_lines([value + '\n'])
            serialized = "%s\x00%s\n" % (self._serialise_key(key),
                                         len(value_lines))
            if not serialized.startswith(self._common_serialised_prefix):
                raise AssertionError('We thought the common prefix was %r'
                    ' but entry %r does not have it in common'
                    % (self._common_serialised_prefix, serialized))
            lines.append(serialized[prefix_len:])
            lines.extend(value_lines)
        sha1, _, _ = store.add_lines((None,), (), lines)
        self._key = StaticTuple("sha1:" + sha1,).intern()
        bytes = ''.join(lines)
        # Sanity check that the incremental size accounting matches what was
        # actually written.
        if len(bytes) != self._current_size():
            raise AssertionError('Invalid _current_size')
        _get_cache().add(self._key, bytes)
        return [self._key]

    def refs(self):
        """Return the references to other CHK's held by this node."""
        return []

    def _compute_search_prefix(self):
        """Determine the common search prefix for all keys in this node.

        :return: A bytestring of the longest search key prefix that is
            common to every key within this node.
        """
        search_keys = [self._search_key_func(key) for key in self._items]
        self._search_prefix = self.common_prefix_for_keys(search_keys)
        return self._search_prefix

    def _are_search_keys_identical(self):
        """Check to see if the search keys for all entries are the same.

        When using a hash as the search_key it is possible for non-identical
        keys to collide. If that happens enough, we may try overflow a
        LeafNode, but as all are collisions, we must not split.
        """
        common_search_key = None
        for key in self._items:
            search_key = self._search_key(key)
            if common_search_key is None:
                common_search_key = search_key
            elif search_key != common_search_key:
                return False
        return True

    def _compute_serialised_prefix(self):
        """Determine the common prefix for serialised keys in this node.

        :return: A bytestring of the longest serialised key prefix that is
            common to every key within this node.
        """
        serialised_keys = [self._serialise_key(key) for key in self._items]
        self._common_serialised_prefix = self.common_prefix_for_keys(
            serialised_keys)
        return self._common_serialised_prefix

    def unmap(self, store, key):
        """Unmap key from the node.

        :raises KeyError: If key is not present (after logging the miss).
        :return: self
        """
        try:
            self._raw_size -= self._key_value_len(key, self._items[key])
        except KeyError:
            trace.mutter("key %s not found in %r", key, self._items)
            raise
        self._len -= 1
        del self._items[key]
        self._key = None
        # Recompute from scratch
        self._compute_search_prefix()
        self._compute_serialised_prefix()
        return self
|
981 |
||
982 |
||
983 |
class InternalNode(Node): |
|
984 |
"""A node that contains references to other nodes.
|
|
985 |
||
986 |
An InternalNode is responsible for mapping search key prefixes to child
|
|
987 |
nodes.
|
|
988 |
||
989 |
:ivar _items: serialised_key => node dictionary. node may be a tuple,
|
|
990 |
LeafNode or InternalNode.
|
|
991 |
"""
|
|
992 |
||
4759.1.2
by John Arbash Meinel
Change CHKMap to use __slots__ |
993 |
__slots__ = ('_node_width',) |
994 |
||
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
995 |
def __init__(self, prefix='', search_key_func=None):
    """Create an InternalNode with no children.

    :param prefix: The search prefix stored on this node.
    :param search_key_func: Function used to turn keys into search keys.
        When None, _search_key_plain is used.
    """
    Node.__init__(self)
    if search_key_func is None:
        search_key_func = _search_key_plain
    self._search_key_func = search_key_func
    self._search_prefix = prefix
    # How many octets key prefixes within this node are.
    self._node_width = 0
|
1005 |
||
1006 |
def add_node(self, prefix, node):
    """Attach node as the child reached through prefix.

    :param prefix: The search key prefix for node.  It must extend this
        node's own search prefix by exactly one character.
    :param node: The node being added.
    :raises AssertionError: If the prefix invariants do not hold.
    """
    own_prefix = self._search_prefix
    if own_prefix is None:
        raise AssertionError("_search_prefix should not be None")
    if not prefix.startswith(own_prefix):
        raise AssertionError("prefixes mismatch: %s must start with %s"
                             % (prefix, own_prefix))
    expected_width = len(own_prefix) + 1
    if len(prefix) != expected_width:
        raise AssertionError("prefix wrong length: len(%s) is not %d" %
                             (prefix, expected_width))
    self._len += len(node)
    if len(self._items) == 0:
        # The first child fixes the width of prefixes in this node.
        self._node_width = len(prefix)
    if self._node_width != expected_width:
        raise AssertionError("node width mismatch: %d is not %d" %
                             (self._node_width, expected_width))
    self._items[prefix] = node
    # The node has changed, so any previously recorded key is dropped.
    self._key = None
|
1028 |
||
1029 |
def _current_size(self): |
|
1030 |
"""Answer the current serialised size of this node."""
|
|
1031 |
return (self._raw_size + len(str(self._len)) + len(str(self._key_width)) + |
|
1032 |
len(str(self._maximum_size))) |
|
1033 |
||
1034 |
@classmethod
def deserialise(klass, bytes, key, search_key_func=None):
    """Build an InternalNode from its serialised bytes.

    :param bytes: The bytes of the node.
    :param key: The key that the serialised node has; it is checked with
        static_tuple.expect_static_tuple before use.
    :param search_key_func: Passed through to _deserialise_internal_node.
    :return: An InternalNode instance.
    """
    return _deserialise_internal_node(
        bytes, static_tuple.expect_static_tuple(key),
        search_key_func=search_key_func)
|
1045 |
||
1046 |
def iteritems(self, store, key_filter=None):
    """Yield every item held by the child nodes matching key_filter.

    :param store: The store handed on to _iter_nodes and to each child.
    :param key_filter: Optional filter; _iter_nodes narrows it to the part
        relevant to each child it returns.
    """
    matching = self._iter_nodes(store, key_filter=key_filter)
    for child, child_filter in matching:
        child_items = child.iteritems(store, key_filter=child_filter)
        for pair in child_items:
            yield pair
|
1050 |
||
1051 |
def _iter_nodes(self, store, key_filter=None, batch_size=None):
    """Iterate over node objects which match key_filter.

    Children that are already loaded are yielded first.  Children still
    held as StaticTuple keys are then looked up in the page cache and,
    failing that, read from store; each one replaces its key in
    self._items as it is loaded.

    :param store: A store to use for accessing content.
    :param key_filter: A key filter to filter nodes. Only nodes that might
        contain a key in key_filter will be returned.
    :param batch_size: If not None, then we will return the nodes that had
        to be read using get_record_stream in batches, rather than reading
        them all at once.
    :return: An iterable of (node, node_key_filter) pairs, where
        node_key_filter is the part of key_filter that can match that node
        (None when key_filter is None). This function does not have to be
        fully consumed. (There will be no pending I/O when items are being
        returned.)
    """
    # Map from chk key ('sha1:...',) to (prefix, key_filter)
    # prefix is the key in self._items to use, key_filter is the key_filter
    # entries that would match this node
    keys = {}
    # Set once one of the fast paths below has fully handled key_filter.
    shortcut = False
    if key_filter is None:
        # yielding all nodes, yield whatever we have, and queue up a read
        # for whatever we are missing
        shortcut = True
        for prefix, node in self._items.iteritems():
            if node.__class__ is StaticTuple:
                keys[node] = (prefix, None)
            else:
                yield node, None
    elif len(key_filter) == 1:
        # Technically, this path could also be handled by the first check
        # in 'self._node_width' in length_filters. However, we can handle
        # this case without spending any time building up the
        # prefix_to_keys, etc state.

        # This is a bit ugly, but TIMEIT showed it to be by far the fastest
        #   0.626us   list(key_filter)[0]
        #       is a func() for list(), 2 mallocs, and a getitem
        #   0.489us   [k for k in key_filter][0]
        #       still has the mallocs, avoids the func() call
        #   0.350us   iter(key_filter).next()
        #       has a func() call, and mallocs an iterator
        #   0.125us   for key in key_filter: pass
        #       no func() overhead, might malloc an iterator
        #   0.105us   for key in key_filter: break
        #       no func() overhead, might malloc an iterator, probably
        #       avoids checking an 'else' clause as part of the for
        for key in key_filter:
            break
        search_prefix = self._search_prefix_filter(key)
        if len(search_prefix) == self._node_width:
            # This item will match exactly, so just do a dict lookup, and
            # see what we can return
            shortcut = True
            try:
                node = self._items[search_prefix]
            except KeyError:
                # A given key can only match 1 child node, if it isn't
                # there, then we can just return nothing
                return
            if node.__class__ is StaticTuple:
                keys[node] = (search_prefix, [key])
            else:
                # This is loaded, and the only thing that can match,
                # return
                yield node, [key]
                return
    if not shortcut:
        # First, convert all keys into a list of search prefixes
        # Aggregate common prefixes, and track the keys they come from
        prefix_to_keys = {}
        length_filters = {}
        for key in key_filter:
            search_prefix = self._search_prefix_filter(key)
            length_filter = length_filters.setdefault(
                                len(search_prefix), set())
            length_filter.add(search_prefix)
            prefix_to_keys.setdefault(search_prefix, []).append(key)

        if (self._node_width in length_filters
            and len(length_filters) == 1):
            # all of the search prefixes match exactly _node_width. This
            # means that everything is an exact match, and we can do a
            # lookup into self._items, rather than iterating over the items
            # dict.
            search_prefixes = length_filters[self._node_width]
            for search_prefix in search_prefixes:
                try:
                    node = self._items[search_prefix]
                except KeyError:
                    # We can ignore this one
                    continue
                node_key_filter = prefix_to_keys[search_prefix]
                if node.__class__ is StaticTuple:
                    keys[node] = (search_prefix, node_key_filter)
                else:
                    yield node, node_key_filter
        else:
            # The slow way. We walk every item in self._items, and check to
            # see if there are any matches
            length_filters = length_filters.items()
            for prefix, node in self._items.iteritems():
                node_key_filter = []
                for length, length_filter in length_filters:
                    sub_prefix = prefix[:length]
                    if sub_prefix in length_filter:
                        node_key_filter.extend(prefix_to_keys[sub_prefix])
                if node_key_filter: # this key matched something, yield it
                    if node.__class__ is StaticTuple:
                        keys[node] = (prefix, node_key_filter)
                    else:
                        yield node, node_key_filter
    if keys:
        # Look in the page cache for some more bytes
        # Hits are collected and removed afterwards so that 'keys' is not
        # modified while it is being iterated.
        found_keys = set()
        for key in keys:
            try:
                bytes = _get_cache()[key]
            except KeyError:
                continue
            else:
                node = _deserialise(bytes, key,
                    search_key_func=self._search_key_func)
                prefix, node_key_filter = keys[key]
                # Replace the key reference with the loaded node.
                self._items[prefix] = node
                found_keys.add(key)
                yield node, node_key_filter
        for key in found_keys:
            del keys[key]
    if keys:
        # demand load some pages.
        if batch_size is None:
            # Read all the keys in
            batch_size = len(keys)
        key_order = list(keys)
        for batch_start in range(0, len(key_order), batch_size):
            batch = key_order[batch_start:batch_start + batch_size]
            # We have to fully consume the stream so there is no pending
            # I/O, so we buffer the nodes for now.
            stream = store.get_record_stream(batch, 'unordered', True)
            node_and_filters = []
            for record in stream:
                bytes = record.get_bytes_as('fulltext')
                node = _deserialise(bytes, record.key,
                                    search_key_func=self._search_key_func)
                prefix, node_key_filter = keys[record.key]
                node_and_filters.append((node, node_key_filter))
                self._items[prefix] = node
                # Remember the bytes in the page cache as well.
                _get_cache().add(record.key, bytes)
            for info in node_and_filters:
                yield info
|
1199 |
||
1200 |
def map(self, store, key, value):
    """Map key to value.

    :param store: The store passed to _iter_nodes, to the child's map() and
        to _check_remap.
    :param key: The key to map.
    :param value: The value to store for key.
    :return: A (search_prefix, [(prefix, node)]) tuple.  The list always has
        a single entry here: either this node, the node returned by
        _check_remap, or the result of mapping into a new parent node.
    :raises AssertionError: If this node has no children, or its width or
        search prefix invariants do not hold.
    """
    if not len(self._items):
        raise AssertionError("can't map in an empty InternalNode.")
    search_key = self._search_key(key)
    if self._node_width != len(self._search_prefix) + 1:
        raise AssertionError("node width mismatch: %d is not %d" %
            (self._node_width, len(self._search_prefix) + 1))
    if not search_key.startswith(self._search_prefix):
        # This key doesn't fit in this index, so we need to split at the
        # point where it would fit, insert self into that internal node,
        # and then map this key into that node.
        new_prefix = self.common_prefix(self._search_prefix,
                                        search_key)
        new_parent = InternalNode(new_prefix,
            search_key_func=self._search_key_func)
        new_parent.set_maximum_size(self._maximum_size)
        new_parent._key_width = self._key_width
        new_parent.add_node(self._search_prefix[:len(new_prefix)+1],
                            self)
        return new_parent.map(store, key, value)
    children = [node for node, _
                      in self._iter_nodes(store, key_filter=[key])]
    if children:
        child = children[0]
    else:
        # new child needed:
        child = self._new_child(search_key, LeafNode)
    # Remember the child's size before the change so that this node's
    # length can be adjusted and shrinkage detected afterwards.
    old_len = len(child)
    if type(child) is LeafNode:
        old_size = child._current_size()
    else:
        old_size = None
    prefix, node_details = child.map(store, key, value)
    if len(node_details) == 1:
        # child may have shrunk, or might be a new node
        child = node_details[0][1]
        self._len = self._len - old_len + len(child)
        self._items[search_key] = child
        self._key = None
        new_node = self
        if type(child) is LeafNode:
            if old_size is None:
                # The old node was an InternalNode which means it has now
                # collapsed, so we need to check if it will chain to a
                # collapse at this level.
                trace.mutter("checking remap as InternalNode -> LeafNode")
                new_node = self._check_remap(store)
            else:
                # If the LeafNode has shrunk in size, we may want to run
                # a remap check. Checking for a remap is expensive though
                # and the frequency of a successful remap is very low.
                # Shrinkage by small amounts is common, so we only do the
                # remap check if the new_size is low or the shrinkage
                # amount is over a configurable limit.
                new_size = child._current_size()
                shrinkage = old_size - new_size
                # 'and' binds tighter than 'or': remap when the node shrank
                # and is now small, or when it shrank by a large amount.
                if (shrinkage > 0 and new_size < _INTERESTING_NEW_SIZE
                    or shrinkage > _INTERESTING_SHRINKAGE_LIMIT):
                    trace.mutter(
                        "checking remap as size shrunk by %d to be %d",
                        shrinkage, new_size)
                    new_node = self._check_remap(store)
        if new_node._search_prefix is None:
            raise AssertionError("_search_prefix should not be None")
        return new_node._search_prefix, [('', new_node)]
    # child has overflown - create a new intermediate node.
    # XXX: This is where we might want to try and expand our depth
    # to refer to more bytes of every child (which would give us
    # multiple pointers to child nodes, but less intermediate nodes)
    child = self._new_child(search_key, InternalNode)
    child._search_prefix = prefix
    for split, node in node_details:
        child.add_node(split, node)
    self._len = self._len - old_len + len(child)
    self._key = None
    return self._search_prefix, [("", self)]
|
1277 |
||
1278 |
def _new_child(self, search_key, klass):
    """Create a new child node of type klass and register it.

    The child is built with ``klass()``, given this node's maximum size,
    key width and search key function, and stored in ``self._items``
    under ``search_key``.

    :param search_key: The key in ``self._items`` to store the child under.
    :param klass: A no-argument callable returning the new node.
    :return: The newly created child.
    """
    fresh = klass()
    fresh.set_maximum_size(self._maximum_size)
    # The child shares this node's key configuration.
    fresh._search_key_func = self._search_key_func
    fresh._key_width = self._key_width
    self._items[search_key] = fresh
    return fresh
|
1286 |
||
1287 |
def serialise(self, store):
    """Serialise the node to store.

    Children held in memory that have no key (``_key is None``) are
    serialised first; then this node's own page is written with
    ``store.add_lines`` and added to the page cache.

    :param store: A VersionedFiles honouring the CHK extensions.
    :return: An iterable of the keys inserted by this operation. Keys of
        any children written come first; this node's key is last.
    :raises AssertionError: If ``_search_prefix`` is None, or if a
        serialised child entry does not start with ``_search_prefix``.
    """
    for child in self._items.itervalues():
        # A StaticTuple entry was never deserialised, and a child that
        # still has a key was never altered: neither needs writing.
        if type(child) is not StaticTuple and child._key is None:
            for written_key in child.serialise(store):
                yield written_key
    # Page header: marker, maximum size, key width and item count.
    lines = [
        "chknode:\n",
        "%d\n" % self._maximum_size,
        "%d\n" % self._key_width,
        "%d\n" % self._len,
        ]
    search_prefix = self._search_prefix
    if search_prefix is None:
        raise AssertionError("_search_prefix should not be None")
    lines.append('%s\n' % (search_prefix,))
    strip = len(search_prefix)
    for prefix, child in sorted(self._items.items()):
        if type(child) is StaticTuple:
            child_key = child[0]
        else:
            child_key = child._key[0]
        entry = "%s\x00%s\n" % (prefix, child_key)
        if not entry.startswith(search_prefix):
            raise AssertionError("prefixes mismatch: %s must start with %s"
                % (entry, search_prefix))
        # The common prefix is already recorded in the header line, so
        # only the remainder of each entry is written.
        lines.append(entry[strip:])
    sha1, _, _ = store.add_lines((None,), (), lines)
    self._key = StaticTuple("sha1:" + sha1,).intern()
    _get_cache().add(self._key, ''.join(lines))
    yield self._key
1324 |
||
1325 |
def _search_key(self, key):
    """Return the serialised key for key in this node.

    Search keys are fixed width: the result is always exactly
    ``self._node_width`` long, padded with ``\\x00`` or truncated as
    necessary.
    """
    width = self._node_width
    return self._search_key_func(key).ljust(width, '\x00')[:width]
|
1330 |
||
1331 |
def _search_prefix_filter(self, key):
    """Serialise key for use as a prefix filter in iteritems.

    The serialised key is truncated to ``self._node_width`` but, unlike
    ``_search_key``, it is not padded when shorter.
    """
    serialised = self._search_key_func(key)
    return serialised[:self._node_width]
|
1334 |
||
1335 |
def _split(self, offset):
    """Split this node into smaller nodes starting at offset.

    When ``offset`` is at or beyond ``self._node_width`` the split is
    delegated to every child and their results are yielded in turn.

    NOTE(review): for a smaller ``offset`` nothing is yielded. The
    original code only had a no-op loop over the items on that path,
    so splitting inside this node's own width looks unimplemented.

    :param offset: The offset to start the new child nodes at.
    :return: An iterable of (prefix, node) tuples. prefix is a byte
        prefix for reaching node.
    """
    if offset >= self._node_width:
        for child in self._items.values():
            for result in child._split(offset):
                yield result
|
|
1349 |
||
1350 |
def refs(self):
    """Return the references to other CHK's held by this node.

    Entries of ``self._items`` that are StaticTuples are returned as they
    are; for any other entry the result of its ``key()`` method is used.

    :return: A list with one reference per item.
    :raises AssertionError: If this node has no key (it is unserialised).
    """
    if self._key is None:
        raise AssertionError("unserialised nodes have no refs.")
    collected = []
    for child in self._items.itervalues():
        if type(child) is not StaticTuple:
            child = child.key()
        collected.append(child)
    return collected
|
1361 |
||
1362 |
def _compute_search_prefix(self, extra_key=None):
    """Recompute, store and return the search prefix for this node.

    The prefix is whatever ``self.common_prefix_for_keys`` returns for
    the keys of ``self._items``; it is saved as ``self._search_prefix``.

    :param extra_key: Accepted but not used by this implementation.
    :return: A bytestring of the longest search key prefix that is
        unique within this node.
    """
    prefix = self.common_prefix_for_keys(self._items)
    self._search_prefix = prefix
    return prefix
|
1370 |
||
3735.2.122
by Ian Clatworthy
don't check_remap on every unmap call in CHKMap.apply_delta() |
1371 |
def unmap(self, store, key, check_remap=True):
    """Remove key from this node and its children.

    :param store: The store passed on to ``_iter_nodes``, to the child's
        ``unmap`` and to ``_check_remap``.
    :param key: The key to remove.
    :param check_remap: If True (the default), finish with
        ``self._check_remap(store)`` when the child that was modified is
        not an InternalNode; if False, skip that check.
    :return: The node that should take this node's place: the single
        remaining child if only one is left, otherwise self or the
        result of ``_check_remap``.
    :raises AssertionError: If this node has no items.
    :raises KeyError: If no child matches key (or the child raises it).
    """
    if not len(self._items):
        raise AssertionError("can't unmap in an empty InternalNode.")
    children = [node for node, _
                in self._iter_nodes(store, key_filter=[key])]
    if not children:
        raise KeyError(key)
    child = children[0]
    # Remove from the child before touching our own bookkeeping: if the
    # child raises (e.g. KeyError because the key is absent), this
    # node's length and key must be left as they were.
    unmapped = child.unmap(store, key)
    self._len -= 1
    self._key = None
    search_key = self._search_key(key)
    if len(unmapped) == 0:
        # All child nodes are gone, remove the child:
        del self._items[search_key]
        unmapped = None
    else:
        # Stash the returned node
        self._items[search_key] = unmapped
    if len(self._items) == 1:
        # this node is no longer needed:
        return self._items.values()[0]
    if type(unmapped) is InternalNode:
        # An InternalNode child cannot collapse into this level, so
        # there is no point running the remap check.
        return self
    if check_remap:
        return self._check_remap(store)
    return self
|
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
1401 |
|
1402 |
def _check_remap(self, store):
    """Check if all keys contained by children fit in a single LeafNode.

    :param store: A store to use for reading more nodes
    :return: Either self, or a new LeafNode which should replace self.
    """
    # Logic for how we determine when we need to rebuild
    # 1) Implicitly unmap() is removing a key which means that the child
    #    nodes are going to be shrinking by some extent.
    # 2) If all children are LeafNodes, it is possible that they could be
    #    combined into a single LeafNode, which can then completely replace
    #    this internal node with a single LeafNode
    # 3) If *one* child is an InternalNode, we assume it has already done
    #    all the work to determine that its children cannot collapse, and
    #    we can then assume that those nodes *plus* the current nodes don't
    #    have a chance of collapsing either.
    #    So a very cheap check is to just say if 'unmapped' is an
    #    InternalNode, we don't have to check further.

    # TODO: Another alternative is to check the total size of all known
    #       LeafNodes. If there is some formula we can use to determine the
    #       final size without actually having to read in any more
    #       children, it would be nice to have. However, we have to be
    #       careful with stuff like nodes that pull out the common prefix
    #       of each key, as adding a new key can change the common prefix
    #       and cause size changes greater than the length of one key.
    #       So for now, we just add everything to a new Leaf until it
    #       splits, as we know that will give the right answer
    merged = LeafNode(search_key_func=self._search_key_func)
    merged.set_maximum_size(self._maximum_size)
    merged._key_width = self._key_width
    # A batch_size of 16 was chosen because:
    #   a) In testing, a 4k page held 14 times. So if we have more than 16
    #      leaf nodes we are unlikely to hold them in a single new leaf
    #      node. This still allows for 1 round trip
    #   b) With 16-way fan out, we can still do a single round trip
    #   c) With 255-way fan out, we don't want to read all 255 and destroy
    #      the page cache, just to determine that we really don't need it.
    for child, _ in self._iter_nodes(store, batch_size=16):
        if type(child) is InternalNode:
            # Without looking at any leaf nodes, we are sure
            return self
        for item_key, item_value in child._items.iteritems():
            # A true result from _map_no_split means the candidate leaf
            # cannot take this item, so no collapse is possible.
            overflowed = merged._map_no_split(item_key, item_value)
            if overflowed:
                return self
    trace.mutter("remap generated a new LeafNode")
    return merged
1449 |
||
1450 |
||
1451 |
def _deserialise(bytes, key, search_key_func):
    """Helper for repositorydetails - convert bytes to a node.

    The node type is chosen from the leading marker line of ``bytes``:
    ``chkleaf:`` gives a LeafNode and ``chknode:`` an InternalNode.

    :param bytes: The serialised page content.
    :param key: The key passed through to the node's ``deserialise``.
    :param search_key_func: Passed through to the node's ``deserialise``.
    :return: The deserialised node.
    :raises AssertionError: If ``bytes`` starts with neither marker.
    """
    if bytes.startswith("chkleaf:\n"):
        return LeafNode.deserialise(bytes, key,
                                    search_key_func=search_key_func)
    if bytes.startswith("chknode:\n"):
        return InternalNode.deserialise(bytes, key,
                                        search_key_func=search_key_func)
    raise AssertionError("Unknown node type.")
|
1461 |
||
1462 |
||
4476.1.38
by John Arbash Meinel
Rename InterestingNodeIterator => CHKMapDifference, update tests. |
1463 |
class CHKMapDifference(object):
    """Iterate the stored pages and key,value pairs for (new - old).

    This class provides a generator over the stored CHK pages and the
    (key, value) pairs that are in any of the new maps and not in any of the
    old maps.

    Note that it may yield chk pages that are common (especially root nodes),
    but it won't yield (key,value) pairs that are common.
    """

    def __init__(self, store, new_root_keys, old_root_keys,
                 search_key_func, pb=None):
        """Set up the difference walker; nothing is read from store yet.

        :param store: Object providing ``get_record_stream`` for CHK pages.
        :param new_root_keys: Root keys of the maps whose content is wanted.
        :param old_root_keys: Root keys of the maps whose content is to be
            excluded.
        :param search_key_func: Passed to ``_deserialise`` and used to turn
            item keys into search prefixes.
        :param pb: Optional progress object; ``tick()`` is called once per
            record read.
        """
        # TODO: Should we add a StaticTuple barrier here? It would be nice to
        #       force callers to use StaticTuple, because there will often be
        #       lots of keys passed in here. And even if we cast it locally,
        #       that just means that we will have *both* a StaticTuple and a
        #       tuple() in memory, referring to the same object. (so a net
        #       increase in memory, not a decrease.)
        self._store = store
        self._new_root_keys = new_root_keys
        self._old_root_keys = old_root_keys
        self._pb = pb
        # All uninteresting chks that we have seen. By the time they are added
        # here, they should be either fully ignored, or queued up for
        # processing
        # TODO: This might grow to a large size if there are lots of merge
        #       parents, etc. However, it probably doesn't scale to O(history)
        #       like _processed_new_refs does.
        self._all_old_chks = set(self._old_root_keys)
        # All items that we have seen from the old_root_keys
        self._all_old_items = set()
        # These are interesting items which were either read, or already in the
        # interesting queue (so we don't need to walk them again)
        # TODO: processed_new_refs becomes O(all_chks), consider switching to
        #       SimpleSet here.
        self._processed_new_refs = set()
        self._search_key_func = search_key_func

        # The uninteresting and interesting nodes to be searched
        self._old_queue = []
        self._new_queue = []
        # Holds the (key, value) items found when processing the root nodes,
        # waiting for the uninteresting nodes to be walked
        self._new_item_queue = []
        self._state = None

    def _read_nodes_from_store(self, keys):
        """Read and deserialise the pages for keys.

        :param keys: The page keys to request from the store.
        :return: A generator of (record, node, prefix_refs, items). For an
            InternalNode prefix_refs holds its (prefix, ref) pairs and
            items is empty; for any other node it is the other way round.
        :raises errors.NoSuchRevision: If the store reports a key absent.
        """
        # We chose not to use _get_cache(), because we think in
        # terms of records to be yielded. Also, we expect to touch each page
        # only 1 time during this code. (We may want to evaluate saving the
        # raw bytes into the page cache, which would allow a working tree
        # update after the fetch to not have to read the bytes again.)
        stream = self._store.get_record_stream(keys, 'unordered', True)
        for record in stream:
            if self._pb is not None:
                self._pb.tick()
            if record.storage_kind == 'absent':
                raise errors.NoSuchRevision(self._store, record.key)
            text = record.get_bytes_as('fulltext')
            node = _deserialise(text, record.key,
                                search_key_func=self._search_key_func)
            if type(node) is InternalNode:
                # Note we don't have to do node.refs() because we know that
                # there are no children that have been pushed into this node
                # Note: Converting each item with StaticTuple.from_sequence
                #       here seemed to save 1.2MB, which would indicate that
                #       we keep 100k prefix_refs around while processing.
                #       They *should* be shorter lived than that...
                #       It does cost us ~10s of processing time, so plain
                #       tuples are used.
                prefix_refs = node._items.items()
                items = []
            else:
                prefix_refs = []
                # Note: We don't use a StaticTuple here. Profiling showed a
                #       minor memory improvement (0.8MB out of 335MB peak 0.2%)
                #       But a significant slowdown (15s / 145s, or 10%)
                items = node._items.items()
            yield record, node, prefix_refs, items

    def _read_old_roots(self):
        """Read the old root pages, recording their refs and items.

        :return: A list of the (prefix, ref) pairs from the old roots whose
            ref had not been seen before; they are to be queued later.
        """
        old_chks_to_enqueue = []
        all_old_chks = self._all_old_chks
        for record, node, prefix_refs, items in \
                self._read_nodes_from_store(self._old_root_keys):
            # Uninteresting node
            prefix_refs = [p_r for p_r in prefix_refs
                           if p_r[1] not in all_old_chks]
            new_refs = [p_r[1] for p_r in prefix_refs]
            all_old_chks.update(new_refs)
            # TODO: This might be a good time to turn items into StaticTuple
            #       instances and possibly intern them. However, this does not
            #       impact 'initial branch' performance, so I'm not worrying
            #       about this yet
            self._all_old_items.update(items)
            # Queue up the uninteresting references
            # Don't actually put them in the 'to-read' queue until we have
            # finished checking the interesting references
            old_chks_to_enqueue.extend(prefix_refs)
        return old_chks_to_enqueue

    def _enqueue_old(self, new_prefixes, old_chks_to_enqueue):
        """Queue the old refs whose prefix overlaps the new prefixes.

        :param new_prefixes: Container of the prefixes still of interest.
        :param old_chks_to_enqueue: Iterable of (prefix, ref) pairs.
        """
        # At this point, we have read all the uninteresting and interesting
        # items, so we can queue up the uninteresting stuff, knowing that we've
        # handled the interesting ones
        for prefix, ref in old_chks_to_enqueue:
            # Try the prefix itself and then each shorter leading part. A
            # ref with no match is not part of the remaining 'interesting
            # set' and is dropped.
            for length in xrange(len(prefix), 0, -1):
                if prefix[:length] in new_prefixes:
                    self._old_queue.append(ref)
                    break

    def _read_all_roots(self):
        """Read the root pages.

        This is structured as a generator, so that the root records can be
        yielded up to whoever needs them without any buffering.
        """
        # This is the bootstrap phase
        if not self._old_root_keys:
            # With no old_root_keys we can just shortcut and be ready
            # for _flush_new_queue
            self._new_queue = list(self._new_root_keys)
            return
        old_chks_to_enqueue = self._read_old_roots()
        # filter out any root keys that are already known to be uninteresting
        new_keys = set(self._new_root_keys).difference(self._all_old_chks)
        # These are prefixes that are present in new_keys that we are
        # thinking to yield
        new_prefixes = set()
        # We are about to yield all of these, so we don't want them getting
        # added a second time
        processed_new_refs = self._processed_new_refs
        processed_new_refs.update(new_keys)
        for record, node, prefix_refs, items in \
                self._read_nodes_from_store(new_keys):
            # At this level, we now know all the uninteresting references
            # So we filter and queue up whatever is remaining
            prefix_refs = [p_r for p_r in prefix_refs
                           if p_r[1] not in self._all_old_chks
                           and p_r[1] not in processed_new_refs]
            refs = [p_r[1] for p_r in prefix_refs]
            new_prefixes.update([p_r[0] for p_r in prefix_refs])
            self._new_queue.extend(refs)
            # TODO: We can potentially get multiple items here, however the
            #       current design allows for this, as callers will do the work
            #       to make the results unique. We might profile whether we
            #       gain anything by ensuring unique return values for items
            # TODO: This might be a good time to cast to StaticTuple, as
            #       self._new_item_queue will hold the contents of multiple
            #       records for an extended lifetime
            new_items = [item for item in items
                         if item not in self._all_old_items]
            self._new_item_queue.extend(new_items)
            new_prefixes.update([self._search_key_func(item[0])
                                 for item in new_items])
            processed_new_refs.update(refs)
            yield record
        # For new_prefixes we have the full length prefixes queued up.
        # However, we also need possible prefixes. (If we have a known ref to
        # 'ab', then we also need to include 'a'.) So expand the
        # new_prefixes to include all shorter prefixes
        for prefix in list(new_prefixes):
            new_prefixes.update([prefix[:i] for i in xrange(1, len(prefix))])
        self._enqueue_old(new_prefixes, old_chks_to_enqueue)

    def _flush_new_queue(self):
        """Yield everything reachable from the new queue that is not old.

        :return: A generator of (record, items). The first pair may be
            (None, items) carrying the items queued while reading the roots.
        """
        # No need to maintain the heap invariant anymore, just pull things out
        # and process them
        refs = set(self._new_queue)
        self._new_queue = []
        # First pass, flush all interesting items and convert to using direct refs
        all_old_chks = self._all_old_chks
        processed_new_refs = self._processed_new_refs
        all_old_items = self._all_old_items
        new_items = [item for item in self._new_item_queue
                     if item not in all_old_items]
        self._new_item_queue = []
        if new_items:
            yield None, new_items
        refs = refs.difference(all_old_chks)
        processed_new_refs.update(refs)
        while refs:
            # TODO: Using a SimpleSet for self._processed_new_refs and
            #       saved as much as 10MB of peak memory. However, it requires
            #       implementing a non-pyrex version.
            next_refs = set()
            next_refs_update = next_refs.update
            # Inlining _read_nodes_from_store improves 'bzr branch bzr.dev'
            # from 1m54s to 1m51s. Consider it.
            for record, _, p_refs, items in self._read_nodes_from_store(refs):
                if all_old_items:
                    # using the 'if' check saves about 145s => 141s, when
                    # streaming initial branch of Launchpad data.
                    items = [item for item in items
                             if item not in all_old_items]
                yield record, items
                next_refs_update([p_r[1] for p_r in p_refs])
                del p_refs
            # set1.difference(set/dict) walks all of set1, and checks if it
            # exists in 'other'.
            # set1.difference(iterable) walks all of iterable, and does a
            # 'difference_update' on a clone of set1. Pick wisely based on the
            # expected sizes of objects.
            # in our case it is expected that 'new_refs' will always be quite
            # small.
            next_refs = next_refs.difference(all_old_chks)
            next_refs = next_refs.difference(processed_new_refs)
            processed_new_refs.update(next_refs)
            refs = next_refs

    def _process_next_old(self):
        """Read every page in the old queue, queueing the refs they add."""
        # Since we don't filter uninteresting any further than during
        # _read_all_roots, process the whole queue in a single pass.
        pending = self._old_queue
        self._old_queue = []
        all_old_chks = self._all_old_chks
        for record, _, prefix_refs, items in \
                self._read_nodes_from_store(pending):
            # TODO: Use StaticTuple here?
            self._all_old_items.update(items)
            unseen = [ref for _, ref in prefix_refs
                      if ref not in all_old_chks]
            self._old_queue.extend(unseen)
            all_old_chks.update(unseen)

    def _process_queues(self):
        """Drain the old queue, then return the new-queue generator."""
        while self._old_queue:
            self._process_next_old()
        return self._flush_new_queue()

    def process(self):
        """Run the whole difference.

        :return: A generator of (record, items): the new root records with
            an empty item list first, then the pairs produced by
            ``_process_queues``.
        """
        for record in self._read_all_roots():
            yield record, []
        for record, items in self._process_queues():
            yield record, items
|
1702 |
||
4476.1.25
by John Arbash Meinel
A bit more testing. |
1703 |
|
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
1704 |
def iter_interesting_nodes(store, interesting_root_keys,
                           uninteresting_root_keys, pb=None):
    """Given root keys, find interesting nodes.

    Evaluate nodes referenced by interesting_root_keys. Ones that are also
    referenced from uninteresting_root_keys are not considered interesting.

    The work is done by a CHKMapDifference built with the store's own
    ``_search_key_func``.

    :param store: The store to read from; must provide ``_search_key_func``.
    :param interesting_root_keys: keys which should be part of the
        "interesting" nodes (which will be yielded)
    :param uninteresting_root_keys: keys which should be filtered out of the
        result set.
    :param pb: Optional progress object handed to CHKMapDifference.
    :return: The generator returned by ``CHKMapDifference.process()``,
        yielding (interesting record, interesting items) pairs.
    """
    difference = CHKMapDifference(
        store,
        interesting_root_keys,
        uninteresting_root_keys,
        search_key_func=store._search_key_func,
        pb=pb)
    return difference.process()
4241.6.1
by Ian Clatworthy
chk_map code from brisbane-core |
1723 |
|
1724 |
||
1725 |
# Prefer the helper implementations from the _chk_map_pyx extension module.
# If that module cannot be imported, hand the ImportError to
# osutils.failed_to_load_extension and fall back to the same set of names
# from the pure-Python _chk_map_py module.
try:
    from bzrlib._chk_map_pyx import (
        _bytes_to_text_key,
        _search_key_16,
        _search_key_255,
        _deserialise_leaf_node,
        _deserialise_internal_node,
        )
except ImportError, e:
    osutils.failed_to_load_extension(e)
    from bzrlib._chk_map_py import (
        _bytes_to_text_key,
        _search_key_16,
        _search_key_255,
        _deserialise_leaf_node,
        _deserialise_internal_node,
        )
# Register whichever search-key implementations were imported above, so they
# can be looked up by name through search_key_registry.
search_key_registry.register('hash-16-way', _search_key_16)
search_key_registry.register('hash-255-way', _search_key_255)
|
4679.9.4
by John Arbash Meinel
A bit broken, but getting there. |
1744 |
|
4679.9.15
by John Arbash Meinel
Cleanup some code paths. Make _check_key a helper that can be used |
1745 |
|
4679.9.4
by John Arbash Meinel
A bit broken, but getting there. |
1746 |
def _check_key(key):
    """Validate the shape of a key, raising if it is malformed.

    A well-formed key is exactly a StaticTuple holding a single str that
    begins with 'sha1:'.  This is a debugging aid; it generally shouldn't be
    called from production code.

    :param key: The object to validate.
    :raises TypeError: If key is not exactly a StaticTuple, or if its only
        element is not exactly a str.
    :raises ValueError: If key does not have length 1, or if its element
        does not start with 'sha1:'.
    """
    key_type = type(key)
    if key_type is not StaticTuple:
        raise TypeError('key %r is not StaticTuple but %s' % (key, key_type))
    key_len = len(key)
    if key_len != 1:
        raise ValueError('key %r should have length 1, not %d'
                         % (key, key_len,))
    element = key[0]
    element_type = type(element)
    if element_type is not str:
        raise TypeError('key %r should hold a str, not %r'
                        % (key, element_type))
    if element.startswith('sha1:'):
        # All checks passed; nothing to report.
        return
    raise ValueError('key %r should point to a sha1:' % (key,))
|
1761 |
||
1762 |