# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22
from bisect import bisect_right
23
from copy import deepcopy
39
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
40
from bzrlib.osutils import basename, dirname
41
from bzrlib.transport import get_transport
44
_BTSIGNATURE = "B+Tree Graph Index 2\n"
45
_OPTION_ROW_LENGTHS = "row_lengths="
46
_LEAF_FLAG = "type=leaf\n"
47
_INTERNAL_FLAG = "type=internal\n"
48
_INTERNAL_OFFSET = "offset="
50
_RESERVED_HEADER_BYTES = 120
53
# 4K per page: 4MB - 1000 entries
54
_NODE_CACHE_SIZE = 1000
56
leaf_value_hits = [0, 0]
57
internal_node_hits = [0, 0]
58
leaf_node_hits = [0, 0]
59
miss_attempts = 0 # Missed this entry while looking up
60
bisect_shortcut = [0, 0]
64
class _BuilderRow(object):
65
"""The stored state accumulated while writing out a row in the index.
67
:ivar spool: A temporary file used to accumulate nodes for this row
69
:ivar nodes: The count of nodes emitted so far.
73
"""Create a _BuilderRow."""
75
self.spool = tempfile.TemporaryFile()
78
def finish_node(self, pad=True):
79
byte_lines, _, padding = self.writer.finish()
82
self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
84
if not pad and padding:
86
skipped_bytes = padding
87
self.spool.writelines(byte_lines)
88
if (self.spool.tell() + skipped_bytes) % _PAGE_SIZE != 0:
89
raise AssertionError("incorrect node length")
94
class _InternalBuilderRow(_BuilderRow):
95
"""The stored state accumulated while writing out internal rows."""
97
def finish_node(self, pad=True):
99
raise AssertionError("Must pad internal nodes only.")
100
_BuilderRow.finish_node(self)
103
class _LeafBuilderRow(_BuilderRow):
104
"""The stored state accumulated while writing out a leaf rows."""
107
class BTreeBuilder(index.GraphIndexBuilder):
108
"""A Builder for B+Tree based Graph indices.
110
The resulting graph has the structure:
112
_SIGNATURE OPTIONS NODES
113
_SIGNATURE := 'B+Tree Graph Index 1' NEWLINE
114
OPTIONS := REF_LISTS KEY_ELEMENTS LENGTH
115
REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
116
KEY_ELEMENTS := 'key_elements=' DIGITS NEWLINE
117
LENGTH := 'len=' DIGITS NEWLINE
118
ROW_LENGTHS := 'row_lengths' DIGITS (COMMA DIGITS)*
119
NODES := NODE_COMPRESSED*
120
NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
121
NODE_RAW := INTERNAL | LEAF
122
INTERNAL := INTERNAL_FLAG POINTERS
123
LEAF := LEAF_FLAG ROWS
124
KEY_ELEMENT := Not-whitespace-utf8
125
KEY := KEY_ELEMENT (NULL KEY_ELEMENT)*
127
ROW := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
129
REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
130
REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
132
VALUE := no-newline-no-null-bytes
135
def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
136
"""See GraphIndexBuilder.__init__.
138
:param spill_at: Optional parameter controlling the maximum number
139
of nodes that BTreeBuilder will hold in memory.
141
index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
142
key_elements=key_elements)
143
self._spill_at = spill_at
144
self._backing_indices = []
146
def add_node(self, key, value, references=()):
147
"""Add a node to the index.
149
If adding the node causes the builder to reach its spill_at threshold,
150
disk spilling will be triggered.
152
:param key: The key. keys are non-empty tuples containing
153
as many whitespace-free utf8 bytestrings as the key length
154
defined for this index.
155
:param references: An iterable of iterables of keys. Each is a
156
reference to another key.
157
:param value: The value to associate with the key. It may be any
158
bytes as long as it does not contain \0 or \n.
160
index.GraphIndexBuilder.add_node(self, key, value, references=references)
161
if len(self._keys) < self._spill_at:
163
iterators_to_combine = [iter(sorted(self._iter_mem_nodes()))]
165
for pos, backing in enumerate(self._backing_indices):
169
iterators_to_combine.append(backing.iter_all_entries())
170
backing_pos = pos + 1
171
new_backing_file, size = \
172
self._write_nodes(self._iter_smallest(iterators_to_combine))
173
new_backing = BTreeGraphIndex(
174
get_transport(dirname(new_backing_file.name)),
175
basename(new_backing_file.name), size)
176
# GC will clean up the file
177
new_backing._file = new_backing_file
178
if len(self._backing_indices) == backing_pos:
179
self._backing_indices.append(None)
180
self._backing_indices[backing_pos] = new_backing
181
for pos in range(backing_pos):
182
self._backing_indices[pos] = None
185
self._nodes_by_key = {}
187
def add_nodes(self, nodes):
188
"""Add nodes to the index.
190
:param nodes: An iterable of (key, node_refs, value) entries to add.
192
if self.reference_lists:
193
for (key, value, node_refs) in nodes:
194
self.add_node(key, value, node_refs)
196
for (key, value) in nodes:
197
self.add_node(key, value)
199
def _iter_mem_nodes(self):
200
"""Iterate over the nodes held in memory."""
201
if self.reference_lists:
202
for key, (absent, references, value) in self._nodes.iteritems():
204
yield self, key, value, references
206
for key, (absent, references, value) in self._nodes.iteritems():
208
yield self, key, value
210
def _iter_smallest(self, iterators_to_combine):
211
if len(iterators_to_combine) == 1:
212
for value in iterators_to_combine[0]:
216
for iterator in iterators_to_combine:
218
current_values.append(iterator.next())
219
except StopIteration:
220
current_values.append(None)
223
# Decorate candidates with the value to allow 2.4's min to be used.
224
candidates = [(item[1][1], item) for item
225
in enumerate(current_values) if item[1] is not None]
226
if not len(candidates):
228
selected = min(candidates)
229
# undecorate back to (pos, node)
230
selected = selected[1]
231
if last == selected[1][1]:
232
raise errors.BadIndexDuplicateKey(last, self)
233
last = selected[1][1]
234
# Yield, with self as the index
235
yield (self,) + selected[1][1:]
238
current_values[pos] = iterators_to_combine[pos].next()
239
except StopIteration:
240
current_values[pos] = None
242
def _add_key(self, string_key, line, rows):
243
"""Add a key to the current chunk.
245
:param string_key: The key to add.
246
:param line: The fully serialised key and value.
248
if rows[-1].writer is None:
249
# opening a new leaf chunk;
250
for pos, internal_row in enumerate(rows[:-1]):
251
# flesh out any internal nodes that are needed to
252
# preserve the height of the tree
253
if internal_row.writer is None:
255
if internal_row.nodes == 0:
256
length -= _RESERVED_HEADER_BYTES # padded
257
internal_row.writer = chunk_writer.ChunkWriter(length, 0)
258
internal_row.writer.write(_INTERNAL_FLAG)
259
internal_row.writer.write(_INTERNAL_OFFSET +
260
str(rows[pos + 1].nodes) + "\n")
263
if rows[-1].nodes == 0:
264
length -= _RESERVED_HEADER_BYTES # padded
265
rows[-1].writer = chunk_writer.ChunkWriter(length)
266
rows[-1].writer.write(_LEAF_FLAG)
267
if rows[-1].writer.write(line):
268
# this key did not fit in the node:
269
rows[-1].finish_node()
270
key_line = string_key + "\n"
272
for row in reversed(rows[:-1]):
273
# Mark the start of the next node in the node above. If it
274
# doesn't fit then propogate upwards until we find one that
276
if row.writer.write(key_line):
279
# We've found a node that can handle the pointer.
282
# If we reached the current root without being able to mark the
283
# division point, then we need a new root:
286
if 'index' in debug.debug_flags:
287
trace.mutter('Inserting new global row.')
288
new_row = _InternalBuilderRow()
290
rows.insert(0, new_row)
291
# This will be padded, hence the -100
292
new_row.writer = chunk_writer.ChunkWriter(
293
_PAGE_SIZE - _RESERVED_HEADER_BYTES,
295
new_row.writer.write(_INTERNAL_FLAG)
296
new_row.writer.write(_INTERNAL_OFFSET +
297
str(rows[1].nodes - 1) + "\n")
298
new_row.writer.write(key_line)
299
self._add_key(string_key, line, rows)
301
def _write_nodes(self, node_iterator):
302
"""Write node_iterator out as a B+Tree.
304
:param node_iterator: An iterator of sorted nodes. Each node should
305
match the output given by iter_all_entries.
306
:return: A file handle for a temporary file containing a B+Tree for
309
# The index rows - rows[0] is the root, rows[1] is the layer under it
312
# forward sorted by key. In future we may consider topological sorting,
313
# at the cost of table scans for direct lookup, or a second index for
316
# A stack with the number of nodes of each size. 0 is the root node
317
# and must always be 1 (if there are any nodes in the tree).
318
self.row_lengths = []
319
# Loop over all nodes adding them to the bottom row
320
# (rows[-1]). When we finish a chunk in a row,
321
# propogate the key that didn't fit (comes after the chunk) to the
322
# row above, transitively.
323
for node in node_iterator:
325
# First key triggers the first row
326
rows.append(_LeafBuilderRow())
328
# TODO: Flattening the node into a string key and a line should
329
# probably be put into a pyrex function. We can do a quick
330
# iter over all the entries to determine the final length,
331
# and then do a single malloc() rather than lots of
332
# intermediate mallocs as we build everything up.
333
# ATM 3 / 13s are spent flattening nodes (10s is compressing)
334
string_key, line = _btree_serializer._flatten_node(node,
335
self.reference_lists)
336
self._add_key(string_key, line, rows)
337
for row in reversed(rows):
338
pad = (type(row) != _LeafBuilderRow)
339
row.finish_node(pad=pad)
340
result = tempfile.NamedTemporaryFile()
341
lines = [_BTSIGNATURE]
342
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
343
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
344
lines.append(_OPTION_LEN + str(key_count) + '\n')
345
row_lengths = [row.nodes for row in rows]
346
lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
347
result.writelines(lines)
348
position = sum(map(len, lines))
350
if position > _RESERVED_HEADER_BYTES:
351
raise AssertionError("Could not fit the header in the"
352
" reserved space: %d > %d"
353
% (position, _RESERVED_HEADER_BYTES))
354
# write the rows out:
356
reserved = _RESERVED_HEADER_BYTES # reserved space for first node
359
# copy nodes to the finalised file.
360
# Special case the first node as it may be prefixed
361
node = row.spool.read(_PAGE_SIZE)
362
result.write(node[reserved:])
363
result.write("\x00" * (reserved - position))
364
position = 0 # Only the root row actually has an offset
365
copied_len = osutils.pumpfile(row.spool, result)
366
if copied_len != (row.nodes - 1) * _PAGE_SIZE:
367
if type(row) != _LeafBuilderRow:
368
raise AssertionError("Not enough data copied")
375
"""Finalise the index.
377
:return: A file handle for a temporary file containing the nodes added
380
return self._write_nodes(self.iter_all_entries())[0]
382
def iter_all_entries(self):
383
"""Iterate over all keys within the index
385
:return: An iterable of (index, key, reference_lists, value). There is no
386
defined order for the result iteration - it will be in the most
387
efficient order for the index (in this case dictionary hash order).
389
if 'evil' in debug.debug_flags:
390
trace.mutter_callsite(3,
391
"iter_all_entries scales with size of history.")
392
# Doing serial rather than ordered would be faster; but this shouldn't
393
# be getting called routinely anyway.
394
iterators = [iter(sorted(self._iter_mem_nodes()))]
395
for backing in self._backing_indices:
396
if backing is not None:
397
iterators.append(backing.iter_all_entries())
398
if len(iterators) == 1:
400
return self._iter_smallest(iterators)
402
def iter_entries(self, keys):
403
"""Iterate over keys within the index.
405
:param keys: An iterable providing the keys to be retrieved.
406
:return: An iterable of (index, key, value, reference_lists). There is no
407
defined order for the result iteration - it will be in the most
408
efficient order for the index (keys iteration order in this case).
411
if self.reference_lists:
412
for key in keys.intersection(self._keys):
413
node = self._nodes[key]
415
yield self, key, node[2], node[1]
417
for key in keys.intersection(self._keys):
418
node = self._nodes[key]
420
yield self, key, node[2]
421
keys.difference_update(self._keys)
422
for backing in self._backing_indices:
427
for node in backing.iter_entries(keys):
429
yield (self,) + node[1:]
431
def iter_entries_prefix(self, keys):
432
"""Iterate over keys within the index using prefix matching.
434
Prefix matching is applied within the tuple of a key, not to within
435
the bytestring of each key element. e.g. if you have the keys ('foo',
436
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
437
only the former key is returned.
439
:param keys: An iterable providing the key prefixes to be retrieved.
440
Each key prefix takes the form of a tuple the length of a key, but
441
with the last N elements 'None' rather than a regular bytestring.
442
The first element cannot be 'None'.
443
:return: An iterable as per iter_all_entries, but restricted to the
444
keys with a matching prefix to those supplied. No additional keys
445
will be returned, and every match that is in the index will be
448
# XXX: To much duplication with the GraphIndex class; consider finding
449
# a good place to pull out the actual common logic.
453
for backing in self._backing_indices:
456
for node in backing.iter_entries_prefix(keys):
457
yield (self,) + node[1:]
458
if self._key_length == 1:
462
raise errors.BadIndexKey(key)
463
if len(key) != self._key_length:
464
raise errors.BadIndexKey(key)
466
node = self._nodes[key]
471
if self.reference_lists:
472
yield self, key, node[2], node[1]
474
yield self, key, node[2]
479
raise errors.BadIndexKey(key)
480
if len(key) != self._key_length:
481
raise errors.BadIndexKey(key)
482
# find what it refers to:
483
key_dict = self._nodes_by_key
485
# find the subdict to return
487
while len(elements) and elements[0] is not None:
488
key_dict = key_dict[elements[0]]
491
# a non-existant lookup.
496
key_dict = dicts.pop(-1)
497
# can't be empty or would not exist
498
item, value = key_dict.iteritems().next()
499
if type(value) == dict:
501
dicts.extend(key_dict.itervalues())
504
for value in key_dict.itervalues():
505
yield (self, ) + value
507
yield (self, ) + key_dict
510
"""Return an estimate of the number of keys in this index.
512
For InMemoryGraphIndex the estimate is exact.
514
return len(self._keys) + sum(backing.key_count() for backing in
515
self._backing_indices if backing is not None)
518
"""In memory index's have no known corruption at the moment."""
521
class _LeafNode(object):
522
"""A leaf node for a serialised B+Tree index."""
524
def __init__(self, bytes, key_length, ref_list_length):
525
"""Parse bytes to create a leaf node object."""
526
# splitlines mangles the \r delimiters.. don't use it.
527
self.keys = dict(_btree_serializer._parse_leaf_lines(bytes,
528
key_length, ref_list_length))
531
class _InternalNode(object):
532
"""An internal node for a serialised B+Tree index."""
534
def __init__(self, bytes):
535
"""Parse bytes to create an internal node object."""
536
# splitlines mangles the \r delimiters.. don't use it.
537
self.keys = self._parse_lines(bytes.split('\n'))
539
def _parse_lines(self, lines):
541
self.offset = int(lines[1][7:])
542
for line in lines[2:]:
545
nodes.append(tuple(line.split('\0')))
549
class BTreeGraphIndex(object):
550
"""Access to nodes via the standard GraphIndex interface for B+Tree's.
552
Individual nodes are held in a LRU cache. This holds the root node in
553
memory except when very large walks are done.
556
def __init__(self, transport, name, size):
557
"""Create a B+Tree index object on the index name.
559
:param transport: The transport to read data for the index from.
560
:param name: The file name of the index on transport.
561
:param size: Optional size of the index in bytes. This allows
562
compatibility with the GraphIndex API, as well as ensuring that
563
the initial read (to read the root node header) can be done
564
without over-reading even on empty indices, and on small indices
565
allows single-IO to read the entire index.
567
self._transport = transport
571
self._page_size = transport.recommended_page_size()
572
self._root_node = None
573
# Default max size is 100,000 leave values
574
self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
575
self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
576
self._internal_node_cache = lru_cache.LRUCache()
577
self._key_count = None
578
self._row_lengths = None
579
self._row_offsets = None # Start of each row, [-1] is the end
581
def __eq__(self, other):
582
"""Equal when self and other were created with the same parameters."""
584
type(self) == type(other) and
585
self._transport == other._transport and
586
self._name == other._name and
587
self._size == other._size)
589
def __ne__(self, other):
590
return not self.__eq__(other)
592
def _get_root_node(self):
593
if self._root_node is None:
594
# We may not have a root node yet
595
nodes = list(self._read_nodes([0]))
597
self._root_node = nodes[0][1]
598
return self._root_node
600
def _cache_nodes(self, nodes, cache):
601
"""Read nodes and cache them in the lru.
603
The nodes list supplied is sorted and then read from disk, each node
604
being inserted it into the _node_cache.
606
Note: Asking for more nodes than the _node_cache can contain will
607
result in some of the results being immediately discarded, to prevent
608
this an assertion is raised if more nodes are asked for than are
611
:return: A dict of {node_pos: node}
613
if len(nodes) > cache._max_cache:
614
trace.mutter('Requesting %s > %s nodes, not all will be cached',
615
len(nodes), cache._max_cache)
617
for node_pos, node in self._read_nodes(sorted(nodes)):
618
if node_pos == 0: # Special case
619
self._root_node = node
621
cache.add(node_pos, node)
622
found[node_pos] = node
625
def _get_nodes(self, cache, node_indexes, counter):
628
for idx in node_indexes:
629
if idx == 0 and self._root_node is not None:
630
found[0] = self._root_node
633
found[idx] = cache[idx]
638
found.update(self._cache_nodes(needed, cache))
641
def _get_internal_nodes(self, node_indexes):
642
"""Get a node, from cache or disk.
644
After getting it, the node will be cached.
646
return self._get_nodes(self._internal_node_cache, node_indexes,
649
def _get_leaf_nodes(self, node_indexes):
650
"""Get a bunch of nodes, from cache or disk."""
651
found = self._get_nodes(self._leaf_node_cache, node_indexes,
653
if self._leaf_value_cache is not None:
654
for node in found.itervalues():
655
for key, value in node.keys.iteritems():
656
if key in self._leaf_value_cache:
657
# Don't add the rest of the keys, we've seen this node
660
self._leaf_value_cache[key] = value
663
def iter_all_entries(self):
664
"""Iterate over all keys within the index.
666
:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
667
The former tuple is used when there are no reference lists in the
668
index, making the API compatible with simple key:value index types.
669
There is no defined order for the result iteration - it will be in
670
the most efficient order for the index.
672
if 'evil' in debug.debug_flags:
673
trace.mutter_callsite(3,
674
"iter_all_entries scales with size of history.")
675
if not self.key_count():
677
start_of_leaves = self._row_offsets[-2]
678
end_of_leaves = self._row_offsets[-1]
679
needed_nodes = range(start_of_leaves, end_of_leaves)
680
# We iterate strictly in-order so that we can use this function
681
# for spilling index builds to disk.
682
if self.node_ref_lists:
683
for _, node in self._read_nodes(needed_nodes):
684
for key, (value, refs) in sorted(node.keys.items()):
685
yield (self, key, value, refs)
687
for _, node in self._read_nodes(needed_nodes):
688
for key, (value, refs) in sorted(node.keys.items()):
689
yield (self, key, value)
692
def _multi_bisect_right(in_keys, fixed_keys):
693
"""Find the positions where each 'in_key' would fit in fixed_keys.
695
This is equivalent to doing "bisect_right" on each in_key into
698
:param in_keys: A sorted list of keys to match with fixed_keys
699
:param fixed_keys: A sorted list of keys to match against
700
:return: A list of (integer position, [key list]) tuples.
705
# no pointers in the fixed_keys list, which means everything must
707
return [(0, in_keys)]
709
# TODO: Iterating both lists will generally take M + N steps
710
# Bisecting each key will generally take M * log2 N steps.
711
# If we had an efficient way to compare, we could pick the method
712
# based on which has the fewer number of steps.
713
# There is also the argument that bisect_right is a compiled
714
# function, so there is even more to be gained.
715
# iter_steps = len(in_keys) + len(fixed_keys)
716
# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
717
if len(in_keys) == 1: # Bisect will always be faster for M = 1
718
bisect_shortcut[0] += 1
719
return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
720
# elif bisect_steps < iter_steps:
721
# bisect_shortcut[0] += len(in_keys)
723
# for key in in_keys:
724
# offsets.setdefault(bisect_right(fixed_keys, key),
726
# return [(o, offsets[o]) for o in sorted(offsets)]
728
bisect_shortcut[1] += len(in_keys)
729
in_keys_iter = iter(in_keys)
730
fixed_keys_iter = enumerate(fixed_keys)
731
cur_in_key = in_keys_iter.next()
732
cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
734
class InputDone(Exception): pass
735
class FixedDone(Exception): pass
740
# TODO: Another possibility is that rather than iterating on each side,
741
# we could use a combination of bisecting and iterating. For
742
# example, while cur_in_key < fixed_key, bisect to find its
743
# point, then iterate all matching keys, then bisect (restricted
744
# to only the remainder) for the next one, etc.
747
if cur_in_key < cur_fixed_key:
749
cur_out = (cur_fixed_offset, cur_keys)
750
output.append(cur_out)
751
while cur_in_key < cur_fixed_key:
752
cur_keys.append(cur_in_key)
754
cur_in_key = in_keys_iter.next()
755
except StopIteration:
757
# At this point cur_in_key must be >= cur_fixed_key
758
# step the cur_fixed_key until we pass the cur key, or walk off
760
while cur_in_key >= cur_fixed_key:
762
cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
763
except StopIteration:
766
# We consumed all of the input, nothing more to do
769
# There was some input left, but we consumed all of fixed, so we
770
# have to add one more for the tail
771
cur_keys = [cur_in_key]
772
cur_keys.extend(in_keys_iter)
773
cur_out = (len(fixed_keys), cur_keys)
774
output.append(cur_out)
777
def iter_entries(self, keys):
778
"""Iterate over keys within the index.
780
:param keys: An iterable providing the keys to be retrieved.
781
:return: An iterable as per iter_all_entries, but restricted to the
782
keys supplied. No additional keys will be returned, and every
783
key supplied that is in the index will be returned.
785
# 6 seconds spent in miss_torture using the sorted() line.
786
# Even with out of order disk IO it seems faster not to sort it when
787
# large queries are being made.
788
# However, now that we are doing multi-way bisecting, we need the keys
789
# in sorted order anyway. We could change the multi-way code to not
790
# require sorted order. (For example, it bisects for the first node,
791
# does an in-order search until a key comes before the current point,
792
# which it then bisects for, etc.)
793
keys = frozenset(keys)
797
global leaf_value_hits, miss_attempts, dupes
798
if not self.key_count():
802
if self._leaf_value_cache is None:
806
value = self._leaf_value_cache.get(key, None)
807
if value is not None:
808
leaf_value_hits[0] += 1
809
# This key is known not to be here, skip it
811
if self.node_ref_lists:
812
yield (self, key, value, refs)
814
yield (self, key, value)
816
leaf_value_hits[1] += 1
817
needed_keys.append(key)
823
# 6 seconds spent in miss_torture using the sorted() line.
824
# Even with out of order disk IO it seems faster not to sort it when
825
# large queries are being made.
826
needed_keys = sorted(needed_keys)
828
nodes_and_keys = [(0, needed_keys)]
830
for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
831
node_indexes = [idx for idx, s_keys in nodes_and_keys]
832
nodes = self._get_internal_nodes(node_indexes)
834
next_nodes_and_keys = []
835
for node_index, sub_keys in nodes_and_keys:
836
node = nodes[node_index]
837
positions = self._multi_bisect_right(sub_keys, node.keys)
838
node_offset = next_row_start + node.offset
839
next_nodes_and_keys.extend([(node_offset + pos, s_keys)
840
for pos, s_keys in positions])
841
nodes_and_keys = next_nodes_and_keys
842
# We should now be at the _LeafNodes
843
node_indexes = [idx for idx, s_keys in nodes_and_keys]
845
# TODO: We may *not* want to always read all the nodes in one
846
# big go. Consider setting a max size on this.
848
nodes = self._get_leaf_nodes(node_indexes)
849
for node_index, sub_keys in nodes_and_keys:
852
node = nodes[node_index]
853
for next_sub_key in sub_keys:
854
if next_sub_key in node.keys:
855
value, refs = node.keys[next_sub_key]
856
if self.node_ref_lists:
857
yield (self, next_sub_key, value, refs)
859
yield (self, next_sub_key, value)
863
def iter_entries_prefix(self, keys):
864
"""Iterate over keys within the index using prefix matching.
866
Prefix matching is applied within the tuple of a key, not to within
867
the bytestring of each key element. e.g. if you have the keys ('foo',
868
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
869
only the former key is returned.
871
WARNING: Note that this method currently causes a full index parse
872
unconditionally (which is reasonably appropriate as it is a means for
873
thunking many small indices into one larger one and still supplies
874
iter_all_entries at the thunk layer).
876
:param keys: An iterable providing the key prefixes to be retrieved.
877
Each key prefix takes the form of a tuple the length of a key, but
878
with the last N elements 'None' rather than a regular bytestring.
879
The first element cannot be 'None'.
880
:return: An iterable as per iter_all_entries, but restricted to the
881
keys with a matching prefix to those supplied. No additional keys
882
will be returned, and every match that is in the index will be
885
keys = sorted(set(keys))
888
# Load if needed to check key lengths
889
if self._key_count is None:
890
self._get_root_node()
891
# TODO: only access nodes that can satisfy the prefixes we are looking
892
# for. For now, to meet API usage (as this function is not used by
893
# current bzrlib) just suck the entire index and iterate in memory.
895
if self.node_ref_lists:
896
if self._key_length == 1:
897
for _1, key, value, refs in self.iter_all_entries():
898
nodes[key] = value, refs
901
for _1, key, value, refs in self.iter_all_entries():
902
key_value = key, value, refs
903
# For a key of (foo, bar, baz) create
904
# _nodes_by_key[foo][bar][baz] = key_value
905
key_dict = nodes_by_key
906
for subkey in key[:-1]:
907
key_dict = key_dict.setdefault(subkey, {})
908
key_dict[key[-1]] = key_value
910
if self._key_length == 1:
911
for _1, key, value in self.iter_all_entries():
915
for _1, key, value in self.iter_all_entries():
916
key_value = key, value
917
# For a key of (foo, bar, baz) create
918
# _nodes_by_key[foo][bar][baz] = key_value
919
key_dict = nodes_by_key
920
for subkey in key[:-1]:
921
key_dict = key_dict.setdefault(subkey, {})
922
key_dict[key[-1]] = key_value
923
if self._key_length == 1:
927
raise errors.BadIndexKey(key)
928
if len(key) != self._key_length:
929
raise errors.BadIndexKey(key)
931
if self.node_ref_lists:
932
value, node_refs = nodes[key]
933
yield self, key, value, node_refs
935
yield self, key, nodes[key]
942
raise errors.BadIndexKey(key)
943
if len(key) != self._key_length:
944
raise errors.BadIndexKey(key)
945
# find what it refers to:
946
key_dict = nodes_by_key
948
# find the subdict whose contents should be returned.
950
while len(elements) and elements[0] is not None:
951
key_dict = key_dict[elements[0]]
954
# a non-existant lookup.
959
key_dict = dicts.pop(-1)
960
# can't be empty or would not exist
961
item, value = key_dict.iteritems().next()
962
if type(value) == dict:
964
dicts.extend(key_dict.itervalues())
967
for value in key_dict.itervalues():
968
# each value is the key:value:node refs tuple
970
yield (self, ) + value
972
# the last thing looked up was a terminal element
973
yield (self, ) + key_dict
976
"""Return an estimate of the number of keys in this index.
978
For BTreeGraphIndex the estimate is exact as it is contained in the
981
if self._key_count is None:
982
self._get_root_node()
983
return self._key_count
985
def _parse_header_from_bytes(self, bytes):
986
"""Parse the header from a region of bytes.
988
:param bytes: The data to parse.
989
:return: An offset, data tuple such as readv yields, for the unparsed
990
data. (which may be of length 0).
992
signature = bytes[0:len(self._signature())]
993
if not signature == self._signature():
994
raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
995
lines = bytes[len(self._signature()):].splitlines()
996
options_line = lines[0]
997
if not options_line.startswith(_OPTION_NODE_REFS):
998
raise errors.BadIndexOptions(self)
1000
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
1002
raise errors.BadIndexOptions(self)
1003
options_line = lines[1]
1004
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
1005
raise errors.BadIndexOptions(self)
1007
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
1009
raise errors.BadIndexOptions(self)
1010
options_line = lines[2]
1011
if not options_line.startswith(_OPTION_LEN):
1012
raise errors.BadIndexOptions(self)
1014
self._key_count = int(options_line[len(_OPTION_LEN):])
1016
raise errors.BadIndexOptions(self)
1017
options_line = lines[3]
1018
if not options_line.startswith(_OPTION_ROW_LENGTHS):
1019
raise errors.BadIndexOptions(self)
1021
self._row_lengths = map(int, [length for length in
1022
options_line[len(_OPTION_ROW_LENGTHS):].split(',')
1025
raise errors.BadIndexOptions(self)
1028
for row in self._row_lengths:
1029
offsets.append(row_offset)
1031
offsets.append(row_offset)
1032
self._row_offsets = offsets
1034
# calculate the bytes we have processed
1035
header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
1036
return header_end, bytes[header_end:]
1038
def _read_nodes(self, nodes):
1039
"""Read some nodes from disk into the LRU cache.
1041
This performs a readv to get the node data into memory, and parses each
1042
node, the yields it to the caller. The nodes are requested in the
1043
supplied order. If possible doing sort() on the list before requesting
1044
a read may improve performance.
1046
:param nodes: The nodes to read. 0 - first node, 1 - second node etc.
1051
offset = index * _PAGE_SIZE
1054
# Root node - special case
1056
size = min(_PAGE_SIZE, self._size)
1058
stream = self._transport.get(self._name)
1059
start = stream.read(_PAGE_SIZE)
1060
# Avoid doing this again
1061
self._size = len(start)
1062
size = min(_PAGE_SIZE, self._size)
1064
size = min(size, self._size - offset)
1065
ranges.append((offset, size))
1068
if self._file is None:
1069
data_ranges = self._transport.readv(self._name, ranges)
1072
for offset, size in ranges:
1073
self._file.seek(offset)
1074
data_ranges.append((offset, self._file.read(size)))
1075
for offset, data in data_ranges:
1077
# extract the header
1078
offset, data = self._parse_header_from_bytes(data)
1081
bytes = zlib.decompress(data)
1082
if bytes.startswith(_LEAF_FLAG):
1083
node = _LeafNode(bytes, self._key_length, self.node_ref_lists)
1084
elif bytes.startswith(_INTERNAL_FLAG):
1085
node = _InternalNode(bytes)
1087
raise AssertionError("Unknown node type for %r" % bytes)
1088
yield offset / _PAGE_SIZE, node
1090
def _signature(self):
1091
"""The file signature for this index type."""
1095
"""Validate that everything in the index can be accessed."""
1096
# just read and parse every node.
1097
self._get_root_node()
1098
if len(self._row_lengths) > 1:
1099
start_node = self._row_offsets[1]
1101
# We shouldn't be reading anything anyway
1103
node_end = self._row_offsets[-1]
1104
for node in self._read_nodes(range(start_node, node_end)):
1109
from bzrlib import _btree_serializer_c as _btree_serializer
1111
from bzrlib import _btree_serializer_py as _btree_serializer