39
39
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
40
from bzrlib.osutils import basename, dirname
41
40
from bzrlib.transport import get_transport
53
52
# 4K per page: 4MB - 1000 entries
54
53
_NODE_CACHE_SIZE = 1000
56
leaf_value_hits = [0, 0]
57
internal_node_hits = [0, 0]
58
leaf_node_hits = [0, 0]
59
miss_attempts = 0 # Missed this entry while looking up
60
bisect_shortcut = [0, 0]
64
56
class _BuilderRow(object):
65
57
"""The stored state accumulated while writing out a row in the index.
86
78
skipped_bytes = padding
87
79
self.spool.writelines(byte_lines)
88
if (self.spool.tell() + skipped_bytes) % _PAGE_SIZE != 0:
89
raise AssertionError("incorrect node length")
80
remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
82
raise AssertionError("incorrect node length: %d, %d"
83
% (self.spool.tell(), remainder))
142
136
key_elements=key_elements)
143
137
self._spill_at = spill_at
144
138
self._backing_indices = []
139
# A map of {key: (node_refs, value)}
141
# Indicate it hasn't been built yet
142
self._nodes_by_key = None
146
144
def add_node(self, key, value, references=()):
147
145
"""Add a node to the index.
157
155
:param value: The value to associate with the key. It may be any
158
156
bytes as long as it does not contain \0 or \n.
160
index.GraphIndexBuilder.add_node(self, key, value, references=references)
158
# we don't care about absent_references
159
node_refs, _ = self._check_key_ref_value(key, references, value)
160
if key in self._nodes:
161
raise errors.BadIndexDuplicateKey(key, self)
162
self._nodes[key] = (node_refs, value)
164
if self._nodes_by_key is not None and self._key_length > 1:
165
self._update_nodes_by_key(key, value, node_refs)
161
166
if len(self._keys) < self._spill_at:
163
iterators_to_combine = [iter(sorted(self._iter_mem_nodes()))]
168
self._spill_mem_keys_to_disk()
170
def _spill_mem_keys_to_disk(self):
171
"""Write the in memory keys down to disk to cap memory consumption.
173
If we already have some keys written to disk, we will combine them so
174
as to preserve the sorted order. The algorithm for combining uses
175
powers of two. So on the first spill, write all mem nodes into a
176
single index. On the second spill, combine the mem nodes with the nodes
177
on disk to create a 2x sized disk index and get rid of the first index.
178
On the third spill, create a single new disk index, which will contain
179
the mem nodes, and preserve the existing 2x sized index. On the fourth,
180
combine mem with the first and second indexes, creating a new one of
181
size 4x. On the fifth create a single new one, etc.
183
iterators_to_combine = [self._iter_mem_nodes()]
165
185
for pos, backing in enumerate(self._backing_indices):
166
186
if backing is None:
170
190
backing_pos = pos + 1
171
191
new_backing_file, size = \
172
192
self._write_nodes(self._iter_smallest(iterators_to_combine))
173
new_backing = BTreeGraphIndex(
174
get_transport(dirname(new_backing_file.name)),
175
basename(new_backing_file.name), size)
193
dir_path, base_name = osutils.split(new_backing_file.name)
194
# Note: The transport here isn't strictly needed, because we will use
195
# direct access to the new_backing._file object
196
new_backing = BTreeGraphIndex(get_transport(dir_path),
176
198
# GC will clean up the file
177
199
new_backing._file = new_backing_file
178
200
if len(self._backing_indices) == backing_pos:
182
204
self._backing_indices[pos] = None
183
205
self._keys = set()
185
self._nodes_by_key = {}
207
self._nodes_by_key = None
187
209
def add_nodes(self, nodes):
188
210
"""Add nodes to the index.
199
221
def _iter_mem_nodes(self):
200
222
"""Iterate over the nodes held in memory."""
201
224
if self.reference_lists:
202
for key, (absent, references, value) in self._nodes.iteritems():
204
yield self, key, value, references
225
for key in sorted(nodes):
226
references, value = nodes[key]
227
yield self, key, value, references
206
for key, (absent, references, value) in self._nodes.iteritems():
208
yield self, key, value
229
for key in sorted(nodes):
230
references, value = nodes[key]
231
yield self, key, value
210
233
def _iter_smallest(self, iterators_to_combine):
211
234
if len(iterators_to_combine) == 1:
365
388
copied_len = osutils.pumpfile(row.spool, result)
366
389
if copied_len != (row.nodes - 1) * _PAGE_SIZE:
367
390
if type(row) != _LeafBuilderRow:
368
raise AssertionError("Not enough data copied")
391
raise AssertionError("Incorrect amount of data copied"
392
" expected: %d, got: %d"
393
% ((row.nodes - 1) * _PAGE_SIZE,
370
396
size = result.tell()
391
417
"iter_all_entries scales with size of history.")
392
418
# Doing serial rather than ordered would be faster; but this shouldn't
393
419
# be getting called routinely anyway.
394
iterators = [iter(sorted(self._iter_mem_nodes()))]
420
iterators = [self._iter_mem_nodes()]
395
421
for backing in self._backing_indices:
396
422
if backing is not None:
397
423
iterators.append(backing.iter_all_entries())
411
437
if self.reference_lists:
412
438
for key in keys.intersection(self._keys):
413
439
node = self._nodes[key]
415
yield self, key, node[2], node[1]
440
yield self, key, node[1], node[0]
417
442
for key in keys.intersection(self._keys):
418
443
node = self._nodes[key]
420
yield self, key, node[2]
444
yield self, key, node[1]
421
445
keys.difference_update(self._keys)
422
446
for backing in self._backing_indices:
423
447
if backing is None:
466
490
node = self._nodes[key]
471
493
if self.reference_lists:
472
yield self, key, node[2], node[1]
494
yield self, key, node[1], node[0]
474
yield self, key, node[2]
496
yield self, key, node[1]
480
502
if len(key) != self._key_length:
481
503
raise errors.BadIndexKey(key)
482
504
# find what it refers to:
483
key_dict = self._nodes_by_key
505
key_dict = self._get_nodes_by_key()
484
506
elements = list(key)
485
507
# find the subdict to return
507
529
yield (self, ) + key_dict
531
def _get_nodes_by_key(self):
532
if self._nodes_by_key is None:
534
if self.reference_lists:
535
for key, (references, value) in self._nodes.iteritems():
536
key_dict = nodes_by_key
537
for subkey in key[:-1]:
538
key_dict = key_dict.setdefault(subkey, {})
539
key_dict[key[-1]] = key, value, references
541
for key, (references, value) in self._nodes.iteritems():
542
key_dict = nodes_by_key
543
for subkey in key[:-1]:
544
key_dict = key_dict.setdefault(subkey, {})
545
key_dict[key[-1]] = key, value
546
self._nodes_by_key = nodes_by_key
547
return self._nodes_by_key
509
549
def key_count(self):
510
550
"""Return an estimate of the number of keys in this index.
644
682
After getting it, the node will be cached.
646
return self._get_nodes(self._internal_node_cache, node_indexes,
684
return self._get_nodes(self._internal_node_cache, node_indexes)
649
686
def _get_leaf_nodes(self, node_indexes):
650
687
"""Get a bunch of nodes, from cache or disk."""
651
found = self._get_nodes(self._leaf_node_cache, node_indexes,
688
found = self._get_nodes(self._leaf_node_cache, node_indexes)
653
689
if self._leaf_value_cache is not None:
654
690
for node in found.itervalues():
655
691
for key, value in node.keys.iteritems():
715
751
# iter_steps = len(in_keys) + len(fixed_keys)
716
752
# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
717
753
if len(in_keys) == 1: # Bisect will always be faster for M = 1
718
bisect_shortcut[0] += 1
719
754
return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
720
755
# elif bisect_steps < iter_steps:
721
# bisect_shortcut[0] += len(in_keys)
723
757
# for key in in_keys:
724
758
# offsets.setdefault(bisect_right(fixed_keys, key),
725
759
# []).append(key)
726
760
# return [(o, offsets[o]) for o in sorted(offsets)]
728
bisect_shortcut[1] += len(in_keys)
729
761
in_keys_iter = iter(in_keys)
730
762
fixed_keys_iter = enumerate(fixed_keys)
731
763
cur_in_key = in_keys_iter.next()
857
886
yield (self, next_sub_key, value, refs)
859
888
yield (self, next_sub_key, value)
863
890
def iter_entries_prefix(self, keys):
864
891
"""Iterate over keys within the index using prefix matching.