# Copyright (C) 2008-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""B+Tree indices"""

from __future__ import absolute_import

import cStringIO

from bzrlib.lazy_import import lazy_import
lazy_import(globals(), """
import bisect
import math
import tempfile
import zlib
""")

from bzrlib import (
    chunk_writer,
    debug,
    errors,
    fifo_cache,
    index,
    lru_cache,
    osutils,
    static_tuple,
    trace,
    transport,
    )
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN


_BTSIGNATURE = "B+Tree Graph Index 2\n"
_OPTION_ROW_LENGTHS = "row_lengths="
_LEAF_FLAG = "type=leaf\n"
_INTERNAL_FLAG = "type=internal\n"
_INTERNAL_OFFSET = "offset="

_RESERVED_HEADER_BYTES = 120
_PAGE_SIZE = 4096

# 4K per page: 1000 entries is ~4MB of cached pages
_NODE_CACHE_SIZE = 1000
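
# Page-layout sketch (illustrative arithmetic derived from the constants
# above, not an additional format rule): every node occupies one _PAGE_SIZE
# byte page. Page 0 additionally reserves _RESERVED_HEADER_BYTES at its start
# for the index header, so the root node's chunk writer gets
# _PAGE_SIZE - _RESERVED_HEADER_BYTES = 3976 usable bytes:
#
#   page 0:    [header <= 120 bytes][root node <= 3976 bytes]
#   page 1..N: [node <= 4096 bytes]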


class _BuilderRow(object):
    """The stored state accumulated while writing out a row in the index.

    :ivar spool: A temporary file used to accumulate nodes for this row
        in the index.
    :ivar nodes: The count of nodes emitted so far.
    """

    def __init__(self):
        """Create a _BuilderRow."""
        self.nodes = 0
        self.spool = None # tempfile.TemporaryFile(prefix='bzr-index-row-')
        self.writer = None

    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = cStringIO.StringIO()
            # reserve space for the header in the first node of the row
            self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
            # We got bigger than 1 node, switch to a temp file
            spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
            spool.write(self.spool.getvalue())
            self.spool = spool
        skipped_bytes = 0
        if not pad and padding:
            del byte_lines[-1]
            skipped_bytes = padding
        self.spool.writelines(byte_lines)
        remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
        if remainder != 0:
            raise AssertionError("incorrect node length: %d, %d"
                                 % (self.spool.tell(), remainder))
        self.nodes += 1
        self.writer = None


class _InternalBuilderRow(_BuilderRow):
    """The stored state accumulated while writing out internal rows."""

    def finish_node(self, pad=True):
        if not pad:
            raise AssertionError("Must pad internal nodes only.")
        _BuilderRow.finish_node(self)


class _LeafBuilderRow(_BuilderRow):
    """The stored state accumulated while writing out leaf rows."""


class BTreeBuilder(index.GraphIndexBuilder):
    """A Builder for B+Tree based Graph indices.

    The resulting graph has the structure:

    _SIGNATURE OPTIONS NODES
    _SIGNATURE     := 'B+Tree Graph Index 2' NEWLINE
    OPTIONS        := REF_LISTS KEY_ELEMENTS LENGTH
    REF_LISTS      := 'node_ref_lists=' DIGITS NEWLINE
    KEY_ELEMENTS   := 'key_elements=' DIGITS NEWLINE
    LENGTH         := 'len=' DIGITS NEWLINE
    ROW_LENGTHS    := 'row_lengths=' DIGITS (COMMA DIGITS)*
    NODES          := NODE_COMPRESSED*
    NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
    NODE_RAW       := INTERNAL | LEAF
    INTERNAL       := INTERNAL_FLAG POINTERS
    LEAF           := LEAF_FLAG ROWS
    KEY_ELEMENT    := Not-whitespace-utf8
    KEY            := KEY_ELEMENT (NULL KEY_ELEMENT)*
    ROWS           := ROW*
    ROW            := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
    ABSENT         := 'a'
    REFERENCES     := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
    REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
    REFERENCE      := KEY
    VALUE          := no-newline-no-null-bytes
    """
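
    # A minimal usage sketch (the keys and values here are invented for
    # illustration; see add_node() below for the exact constraints):
    #
    #   builder = BTreeBuilder(reference_lists=1, key_elements=1)
    #   builder.add_node(('rev-a',), 'value bytes', ([],))
    #   builder.add_node(('rev-b',), 'more bytes', ([('rev-a',)],))
    #   tree_file = builder.finish()  # temporary file holding the B+Tree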

    def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
        """See GraphIndexBuilder.__init__.

        :param spill_at: Optional parameter controlling the maximum number
            of nodes that BTreeBuilder will hold in memory.
        """
        index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
            key_elements=key_elements)
        self._spill_at = spill_at
        self._backing_indices = []
        # A map of {key: (node_refs, value)}
        self._nodes = {}
        # Indicate it hasn't been built yet
        self._nodes_by_key = None
        self._optimize_for_size = False

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        If adding the node causes the builder to reach its spill_at threshold,
        disk spilling will be triggered.

        :param key: The key. Keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        # Ensure that 'key' is a StaticTuple
        key = static_tuple.StaticTuple.from_sequence(key).intern()
        # we don't care about absent_references
        node_refs, _ = self._check_key_ref_value(key, references, value)
        if key in self._nodes:
            raise errors.BadIndexDuplicateKey(key, self)
        self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)
        if len(self._nodes) < self._spill_at:
            return
        self._spill_mem_keys_to_disk()
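
    # Illustration (hypothetical keys and values, not from the original
    # source): for an index built with key_elements=2 and reference_lists=1,
    # a valid call looks like
    #
    #   builder.add_node(('file-id', 'rev-id'), '12 89',
    #                    ([('file-id', 'parent-rev')],))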

    def _spill_mem_keys_to_disk(self):
        """Write the in memory keys down to disk to cap memory consumption.

        If we already have some keys written to disk, we will combine them so
        as to preserve the sorted order. The algorithm for combining uses
        powers of two. So on the first spill, write all mem nodes into a
        single index. On the second spill, combine the mem nodes with the nodes
        on disk to create a 2x sized disk index and get rid of the first index.
        On the third spill, create a single new disk index, which will contain
        the mem nodes, and preserve the existing 2x sized index. On the fourth,
        combine mem with the first and second indexes, creating a new one of
        size 4x. On the fifth create a single new one, etc.
        """
        if self._combine_backing_indices:
            (new_backing_file, size,
             backing_pos) = self._spill_mem_keys_and_combine()
        else:
            new_backing_file, size = self._spill_mem_keys_without_combining()
        # Note: The transport here isn't strictly needed, because we will use
        #       direct access to the new_backing._file object; the name passed
        #       below is only a placeholder.
        new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
                                      '<temp>', size)
        # GC will clean up the file
        new_backing._file = new_backing_file
        if self._combine_backing_indices:
            if len(self._backing_indices) == backing_pos:
                self._backing_indices.append(None)
            self._backing_indices[backing_pos] = new_backing
            for backing_pos in range(backing_pos):
                self._backing_indices[backing_pos] = None
        else:
            self._backing_indices.append(new_backing)
        self._nodes = {}
        self._nodes_by_key = None
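
    # Worked illustration of the powers-of-two combining described above
    # (entries are backing index sizes in units of one spill; None marks a
    # slot whose contents were folded into a later, larger index):
    #
    #   spill 1: [1]
    #   spill 2: [None, 2]
    #   spill 3: [1, 2]
    #   spill 4: [None, None, 4]
    #   spill 5: [1, None, 4]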

    def _spill_mem_keys_without_combining(self):
        return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)

    def _spill_mem_keys_and_combine(self):
        iterators_to_combine = [self._iter_mem_nodes()]
        pos = -1
        for pos, backing in enumerate(self._backing_indices):
            if backing is None:
                pos -= 1
                break
            iterators_to_combine.append(backing.iter_all_entries())
        backing_pos = pos + 1
        new_backing_file, size = \
            self._write_nodes(self._iter_smallest(iterators_to_combine),
                              allow_optimize=False)
        return new_backing_file, size, backing_pos

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, value, node_refs) entries to add.
        """
        if self.reference_lists:
            for (key, value, node_refs) in nodes:
                self.add_node(key, value, node_refs)
        else:
            for (key, value) in nodes:
                self.add_node(key, value)

    def _iter_mem_nodes(self):
        """Iterate over the nodes held in memory."""
        nodes = self._nodes
        if self.reference_lists:
            for key in sorted(nodes):
                references, value = nodes[key]
                yield self, key, value, references
        else:
            for key in sorted(nodes):
                references, value = nodes[key]
                yield self, key, value

    def _iter_smallest(self, iterators_to_combine):
        if len(iterators_to_combine) == 1:
            for value in iterators_to_combine[0]:
                yield value
            return
        current_values = []
        for iterator in iterators_to_combine:
            try:
                current_values.append(iterator.next())
            except StopIteration:
                current_values.append(None)

        last = None
        while True:
            # Decorate candidates with the value to allow 2.4's min to be used.
            candidates = [(item[1][1], item) for item
                          in enumerate(current_values) if item[1] is not None]
            if not len(candidates):
                return
            selected = min(candidates)
            # undecorate back to (pos, node)
            selected = selected[1]
            if last == selected[1][1]:
                raise errors.BadIndexDuplicateKey(last, self)
            last = selected[1][1]
            # Yield, with self as the index
            yield (self,) + selected[1][1:]
            pos = selected[0]
            try:
                current_values[pos] = iterators_to_combine[pos].next()
            except StopIteration:
                current_values[pos] = None

    def _add_key(self, string_key, line, rows, allow_optimize=True):
        """Add a key to the current chunk.

        :param string_key: The key to add.
        :param line: The fully serialised key and value.
        :param allow_optimize: If set to False, prevent setting the optimize
            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        """
        new_leaf = False
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            new_leaf = True
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree
                if internal_row.writer is None:
                    length = _PAGE_SIZE
                    if internal_row.nodes == 0:
                        length -= _RESERVED_HEADER_BYTES # padded
                    if allow_optimize:
                        optimize_for_size = self._optimize_for_size
                    else:
                        optimize_for_size = False
                    internal_row.writer = chunk_writer.ChunkWriter(length, 0,
                        optimize_for_size=optimize_for_size)
                    internal_row.writer.write(_INTERNAL_FLAG)
                    internal_row.writer.write(_INTERNAL_OFFSET +
                        str(rows[pos + 1].nodes) + "\n")
            # add a new leaf
            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
                length -= _RESERVED_HEADER_BYTES # padded
            rows[-1].writer = chunk_writer.ChunkWriter(length,
                optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            if new_leaf:
                raise errors.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + "\n"
            new_row = True
            for row in reversed(rows[:-1]):
                # Mark the start of the next node in the node above. If it
                # doesn't fit then propagate upwards until we find one that
                # it does fit into.
                if row.writer.write(key_line):
                    row.finish_node()
                else:
                    # We've found a node that can handle the pointer.
                    new_row = False
                    break
            # If we reached the current root without being able to mark the
            # division point, then we need a new root:
            if new_row:
                if 'index' in debug.debug_flags:
                    trace.mutter('Inserting new global row.')
                new_row = _InternalBuilderRow()
                rows.insert(0, new_row)
                # This will be padded, hence the reduced length
                new_row.writer = chunk_writer.ChunkWriter(
                    _PAGE_SIZE - _RESERVED_HEADER_BYTES,
                    0,
                    optimize_for_size=self._optimize_for_size)
                new_row.writer.write(_INTERNAL_FLAG)
                new_row.writer.write(_INTERNAL_OFFSET +
                    str(rows[1].nodes - 1) + "\n")
                new_row.writer.write(key_line)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)

    def _write_nodes(self, node_iterator, allow_optimize=True):
        """Write node_iterator out as a B+Tree.

        :param node_iterator: An iterator of sorted nodes. Each node should
            match the output given by iter_all_entries.
        :param allow_optimize: If set to False, prevent setting the optimize
            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        :return: A file handle for a temporary file containing a B+Tree for
            the nodes.
        """
        # The index rows - rows[0] is the root, rows[1] is the layer under it
        # etc.
        rows = []
        # forward sorted by key. In future we may consider topological sorting,
        # at the cost of table scans for direct lookup, or a second index for
        # direct lookup
        key_count = 0
        # A stack with the number of nodes of each size. 0 is the root node
        # and must always be 1 (if there are any nodes in the tree).
        self.row_lengths = []
        # Loop over all nodes adding them to the bottom row
        # (rows[-1]). When we finish a chunk in a row,
        # propagate the key that didn't fit (comes after the chunk) to the
        # row above, transitively.
        for node in node_iterator:
            if key_count == 0:
                # First key triggers the first row
                rows.append(_LeafBuilderRow())
            key_count += 1
            string_key, line = _btree_serializer._flatten_node(node,
                                                self.reference_lists)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (type(row) != _LeafBuilderRow)
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
        lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
        lines.append(_OPTION_LEN + str(key_count) + '\n')
        row_lengths = [row.nodes for row in rows]
        lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = cStringIO.StringIO()
        result.writelines(lines)
        position = sum(map(len, lines))
        if position > _RESERVED_HEADER_BYTES:
            raise AssertionError("Could not fit the header in the"
                                 " reserved space: %d > %d"
                                 % (position, _RESERVED_HEADER_BYTES))
        # write the rows out:
        for row in rows:
            reserved = _RESERVED_HEADER_BYTES # reserved space for first node
            row.spool.flush()
            row.spool.seek(0)
            # copy nodes to the finalised file.
            # Special case the first node as it may be prefixed
            node = row.spool.read(_PAGE_SIZE)
            result.write(node[reserved:])
            if len(node) == _PAGE_SIZE:
                result.write("\x00" * (reserved - position))
            position = 0 # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if type(row) != _LeafBuilderRow:
                    raise AssertionError("Incorrect amount of data copied"
                                         " expected: %d, got: %d"
                                         % ((row.nodes - 1) * _PAGE_SIZE,
                                            copied_len))
        result.flush()
        size = result.tell()
        result.seek(0)
        return result, size

    def finish(self):
        """Finalise the index.

        :return: A file handle for a temporary file containing the nodes added
            to the index.
        """
        return self._write_nodes(self.iter_all_entries())[0]

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value, reference_lists). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        # Doing serial rather than ordered would be faster; but this shouldn't
        # be getting called routinely anyway.
        iterators = [self._iter_mem_nodes()]
        for backing in self._backing_indices:
            if backing is not None:
                iterators.append(backing.iter_all_entries())
        if len(iterators) == 1:
            return iterators[0]
        return self._iter_smallest(iterators)

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        keys = set(keys)
        # Note: We don't use keys.intersection() here. If you read the C api,
        #       set.intersection(other) special cases when other is a set and
        #       will iterate the smaller of the two and lookup in the other.
        #       It does *not* do this for any other type (even dict, unlike
        #       some other set functions.) Since we expect keys is generally <<
        #       self._nodes, it is faster to iterate over it in a list.
        nodes = self._nodes
        local_keys = [key for key in keys if key in nodes]
        if self.reference_lists:
            for key in local_keys:
                node = nodes[key]
                yield self, key, node[1], node[0]
        else:
            for key in local_keys:
                node = nodes[key]
                yield self, key, node[1]
        # Find things that are in backing indices that have not been handled
        # yet.
        if not self._backing_indices:
            return # We won't find anything there either
        # Remove all of the keys that we found locally
        keys.difference_update(local_keys)
        for backing in self._backing_indices:
            if backing is None:
                continue
            for node in backing.iter_entries(keys):
                keys.remove(node[1])
                yield (self,) + node[1:]

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        # XXX: Too much duplication with the GraphIndex class; consider finding
        # a good place to pull out the actual common logic.
        keys = set(keys)
        if not keys:
            return
        for backing in self._backing_indices:
            if backing is None:
                continue
            for node in backing.iter_entries_prefix(keys):
                yield (self,) + node[1:]
        if self._key_length == 1:
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                try:
                    node = self._nodes[key]
                except KeyError:
                    continue
                if self.reference_lists:
                    yield self, key, node[1], node[0]
                else:
                    yield self, key, node[1]
            return
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = self._get_nodes_by_key()
            elements = list(key)
            # find the subdict to return
            try:
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements.pop(0)
            except KeyError:
                # a non-existent lookup.
                continue
            if len(elements):
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    item, value = key_dict.iteritems().next()
                    if type(value) == dict:
                        # push all my children
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            yield (self, ) + tuple(value)
            else:
                yield (self, ) + key_dict
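
    # Prefix-matching illustration (hypothetical data): with key_elements=2
    # and stored keys ('foo', 'bar') and ('foobar', 'gam'),
    #
    #   list(builder.iter_entries_prefix([('foo', None)]))
    #
    # matches only ('foo', 'bar'): the prefix applies to whole tuple
    # elements, never to substrings of an element.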

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in self._nodes.iteritems():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in self._nodes.iteritems():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
            self._nodes_by_key = nodes_by_key
        return self._nodes_by_key

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For InMemoryGraphIndex the estimate is exact.
        """
        return len(self._nodes) + sum(backing.key_count() for backing in
            self._backing_indices if backing is not None)

    def validate(self):
        """In memory indexes have no known corruption at the moment."""


class _LeafNode(dict):
    """A leaf node for a serialised B+Tree index."""

    __slots__ = ('min_key', 'max_key', '_keys')

    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""
        # splitlines mangles the \r delimiters - don't use it.
        key_list = _btree_serializer._parse_leaf_lines(bytes,
            key_length, ref_list_length)
        if key_list:
            self.min_key = key_list[0][0]
            self.max_key = key_list[-1][0]
        else:
            self.min_key = self.max_key = None
        super(_LeafNode, self).__init__(key_list)
        self._keys = dict(self)

    def all_items(self):
        """Return a sorted list of (key, (value, refs)) items"""
        return sorted(self.items())

    def all_keys(self):
        """Return a sorted list of all keys."""
        return sorted(self.keys())


class _InternalNode(object):
    """An internal node for a serialised B+Tree index."""

    __slots__ = ('keys', 'offset')

    def __init__(self, bytes):
        """Parse bytes to create an internal node object."""
        # splitlines mangles the \r delimiters - don't use it.
        self.keys = self._parse_lines(bytes.split('\n'))

    def _parse_lines(self, lines):
        nodes = []
        # lines[1] is the _INTERNAL_OFFSET line; skip the "offset=" prefix
        self.offset = int(lines[1][7:])
        as_st = static_tuple.StaticTuple.from_sequence
        for line in lines[2:]:
            if line == '':
                break
            nodes.append(as_st(map(intern, line.split('\0'))).intern())
        return nodes


class BTreeGraphIndex(object):
    """Access to nodes via the standard GraphIndex interface for B+Trees.

    Individual nodes are held in an LRU cache. This holds the root node in
    memory except when very large walks are done.
    """

    def __init__(self, transport, name, size, unlimited_cache=False,
                 offset=0):
        """Create a B+Tree index object on the index name.

        :param transport: The transport to read data for the index from.
        :param name: The file name of the index on transport.
        :param size: Optional size of the index in bytes. This allows
            compatibility with the GraphIndex API, as well as ensuring that
            the initial read (to read the root node header) can be done
            without over-reading even on empty indices, and on small indices
            allows single-IO to read the entire index.
        :param unlimited_cache: If set to True, then instead of using an
            LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
            cache all leaf nodes.
        :param offset: The offset within the file at which the btree index
            data starts; it need not be byte 0 of the file.
        """
        self._transport = transport
        self._name = name
        self._size = size
        self._file = None
        self._recommended_pages = self._compute_recommended_pages()
        self._root_node = None
        self._base_offset = offset
        self._leaf_factory = _LeafNode
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
        if unlimited_cache:
            self._leaf_node_cache = {}
            self._internal_node_cache = {}
        else:
            self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
            # We use a FIFO here just to prevent possible blowout. However, a
            # 300k record btree has only 3k leaf nodes, and only 20 internal
            # nodes. A value of 100 scales to ~100*100*100 = 1M records.
            self._internal_node_cache = fifo_cache.FIFOCache(100)
        self._key_count = None
        self._row_lengths = None
        self._row_offsets = None # Start of each row, [-1] is the end
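
    # Usage sketch (the path and index file name below are invented for
    # illustration):
    #
    #   t = transport.get_transport_from_path('/path/to/indices')
    #   idx = BTreeGraphIndex(t, 'example.bix', size=None)
    #   for entry in idx.iter_all_entries():
    #       pass  # (index, key, value[, reference_lists]) tuples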

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            type(self) == type(other) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)

    def __ne__(self, other):
        return not self.__eq__(other)

    def _get_and_cache_nodes(self, nodes):
        """Read nodes and cache them in the lru.

        The nodes list supplied is sorted and then read from disk, each node
        being inserted into the _node_cache.

        Note: Asking for more nodes than the _node_cache can contain will
        result in some of the results being immediately discarded; to prevent
        this, an assertion is raised if more nodes are asked for than are
        cachable.

        :return: A dict of {node_pos: node}
        """
        found = {}
        start_of_leaves = None
        for node_pos, node in self._read_nodes(sorted(nodes)):
            if node_pos == 0: # Special case
                self._root_node = node
            else:
                if start_of_leaves is None:
                    start_of_leaves = self._row_offsets[-2]
                if node_pos < start_of_leaves:
                    self._internal_node_cache[node_pos] = node
                else:
                    self._leaf_node_cache[node_pos] = node
            found[node_pos] = node
        return found

    def _compute_recommended_pages(self):
        """Convert transport's recommended_page_size into btree pages.

        recommended_page_size is in bytes, we want to know how many _PAGE_SIZE
        pages fit in that length.
        """
        recommended_read = self._transport.recommended_page_size()
        recommended_pages = int(math.ceil(recommended_read /
                                          float(_PAGE_SIZE)))
        return recommended_pages

    def _compute_total_pages_in_index(self):
        """How many pages are in the index.

        If we have read the header we will use the value stored there.
        Otherwise it will be computed based on the length of the index.
        """
        if self._size is None:
            raise AssertionError('_compute_total_pages_in_index should not be'
                                 ' called when self._size is None')
        if self._root_node is not None:
            # This is the number of pages as defined by the header
            return self._row_offsets[-1]
        # This is the number of pages as defined by the size of the index. They
        # should be identical.
        total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
        return total_pages

    def _expand_offsets(self, offsets):
        """Find extra pages to download.

        The idea is that we always want to make big-enough requests (like 64kB
        for http), so that we don't waste round trips. So given the entries
        that we already have cached and the new pages being downloaded figure
        out what other pages we might want to read.

        See also doc/developers/btree_index_prefetch.txt for more details.

        :param offsets: The offsets to be read
        :return: A list of offsets to download
        """
        if 'index' in debug.debug_flags:
            trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)

        if len(offsets) >= self._recommended_pages:
            # Don't add more, we are already requesting more than enough
            if 'index' in debug.debug_flags:
                trace.mutter('  not expanding large request (%s >= %s)',
                             len(offsets), self._recommended_pages)
            return offsets
        if self._size is None:
            # Don't try anything, because we don't know where the file ends
            if 'index' in debug.debug_flags:
                trace.mutter('  not expanding without knowing index size')
            return offsets
        total_pages = self._compute_total_pages_in_index()
        cached_offsets = self._get_offsets_to_cached_pages()
        # If reading recommended_pages would read the rest of the index, just
        # read it all
        if total_pages - len(cached_offsets) <= self._recommended_pages:
            # Read whatever is left
            if cached_offsets:
                expanded = [x for x in xrange(total_pages)
                            if x not in cached_offsets]
            else:
                expanded = range(total_pages)
            if 'index' in debug.debug_flags:
                trace.mutter('  reading all unread pages: %s', expanded)
            return expanded

        if self._root_node is None:
            # ATM on the first read of the root node of a large index, we don't
            # bother pre-reading any other pages. This is because the
            # likelihood of actually reading interesting pages is very low.
            # See doc/developers/btree_index_prefetch.txt for a discussion, and
            # a possible implementation when we are guessing that the second
            # layer index is small
            final_offsets = offsets
        else:
            tree_depth = len(self._row_lengths)
            if len(cached_offsets) < tree_depth and len(offsets) == 1:
                # We haven't read enough to justify expansion
                # If we are only going to read the root node, and 1 leaf node,
                # then it isn't worth expanding our request. Once we've read at
                # least 2 nodes, then we are probably doing a search, and we
                # start expanding our requests.
                if 'index' in debug.debug_flags:
                    trace.mutter('  not expanding on first reads')
                return offsets
            final_offsets = self._expand_to_neighbors(offsets, cached_offsets,
                                                      total_pages)

        final_offsets = sorted(final_offsets)
        if 'index' in debug.debug_flags:
            trace.mutter('expanded: %s', final_offsets)
        return final_offsets
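
    # Worked illustration (hypothetical numbers): suppose
    # _recommended_pages == 16 for a 100-page index whose internal layers are
    # already cached. A request for the single leaf offset [37] is expanded
    # toward ~16 neighboring uncached offsets in the same layer, so one
    # 64kB read replaces many small round trips.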

    def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
        """Expand requests to neighbors until we have enough pages.

        This is called from _expand_offsets after policy has determined that we
        want to expand.
        We only want to expand requests within a given layer. We cheat a little
        bit and assume all requests will be in the same layer. This is true
        given the current design, but if it changes this algorithm may perform
        oddly.

        :param offsets: requested offsets
        :param cached_offsets: offsets for pages we currently have cached
        :return: A set() of offsets after expansion
        """
        final_offsets = set(offsets)
        first = end = None
        new_tips = set(final_offsets)
        while len(final_offsets) < self._recommended_pages and new_tips:
            next_tips = set()
            for pos in new_tips:
                if first is None:
                    first, end = self._find_layer_first_and_end(pos)
                previous = pos - 1
                if (previous > 0
                    and previous not in cached_offsets
                    and previous not in final_offsets
                    and previous >= first):
                    next_tips.add(previous)
                after = pos + 1
                if (after < total_pages
                    and after not in cached_offsets
                    and after not in final_offsets
                    and after < end):
                    next_tips.add(after)
                # This would keep us from going bigger than
                # recommended_pages by only expanding the first offsets.
                # However, if we are making a 'wide' request, it is
                # reasonable to expand all points equally.
                # if len(final_offsets) > recommended_pages:
                #     break
            final_offsets.update(next_tips)
            new_tips = next_tips
        return final_offsets

    def clear_cache(self):
        """Clear out any cached/memoized values.

        This can be called at any time, but generally it is used when we have
        extracted some information, but don't expect to be requesting any more
        from this index.
        """
        # Note that we don't touch self._root_node or self._internal_node_cache
        # We don't expect either of those to be big, and it can save
        # round-trips in the future. We may re-evaluate this if InternalNode
        # memory starts to be an issue.
        self._leaf_node_cache.clear()

    def external_references(self, ref_list_num):
        if self._root_node is None:
            self._get_root_node()
        if ref_list_num + 1 > self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                % (ref_list_num, self.node_ref_lists))
        keys = set()
        refs = set()
        for node in self.iter_all_entries():
            keys.add(node[1])
            refs.update(node[3][ref_list_num])
        return refs - keys

    def _find_layer_first_and_end(self, offset):
        """Find the start/stop nodes for the layer corresponding to offset.

        :return: (first, end)
            first is the first node in this layer
            end is the first node of the next layer
        """
        first = end = 0
        for roffset in self._row_offsets:
            first = end
            end = roffset
            if offset < roffset:
                break
        return first, end

    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache.keys())
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
        return cached_offsets

    def _get_root_node(self):
        if self._root_node is None:
            # We may not have a root node yet
            self._get_internal_nodes([0])
        return self._root_node

    def _get_nodes(self, cache, node_indexes):
        found = {}
        needed = []
        for idx in node_indexes:
            if idx == 0 and self._root_node is not None:
                found[0] = self._root_node
                continue
            try:
                found[idx] = cache[idx]
            except KeyError:
                needed.append(idx)
        if not needed:
            return found
        needed = self._expand_offsets(needed)
        found.update(self._get_and_cache_nodes(needed))
        return found

    def _get_internal_nodes(self, node_indexes):
        """Get a node, from cache or disk.

        After getting it, the node will be cached.
        """
        return self._get_nodes(self._internal_node_cache, node_indexes)

    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in nodes.itervalues():
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node
                        # before.
                        break
                    self._leaf_value_cache[key] = value

    def _get_leaf_nodes(self, node_indexes):
        """Get a bunch of nodes, from cache or disk."""
        found = self._get_nodes(self._leaf_node_cache, node_indexes)
        self._cache_leaf_values(found)
        return found

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value) or (index, key, value, reference_lists).
            The former tuple is used when there are no reference lists in the
            index, making the API compatible with simple key:value index types.
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        if not self.key_count():
            return
        if self._row_offsets[-1] == 1:
            # There is only the root node, and we read that via key_count()
            if self.node_ref_lists:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value, refs)
            else:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value)
            return
        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = range(start_of_leaves, end_of_leaves)
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read everything
            nodes = [(0, self._root_node)]
        else:
            nodes = self._read_nodes(needed_offsets)
        # We iterate strictly in-order so that we can use this function
        # for spilling index builds to disk.
        if self.node_ref_lists:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value, refs)
        else:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value)

    @staticmethod
    def _multi_bisect_right(in_keys, fixed_keys):
        """Find the positions where each 'in_key' would fit in fixed_keys.

        This is equivalent to doing "bisect_right" on each in_key into
        fixed_keys.

        :param in_keys: A sorted list of keys to match with fixed_keys
        :param fixed_keys: A sorted list of keys to match against
        :return: A list of (integer position, [key list]) tuples.
        """
        if not in_keys:
            return []
        if not fixed_keys:
            # no pointers in the fixed_keys list, which means everything must
            # fall to the left.
            return [(0, in_keys)]

        # TODO: Iterating both lists will generally take M + N steps
        #       Bisecting each key will generally take M * log2 N steps.
        #       If we had an efficient way to compare, we could pick the method
        #       based on which has the fewer number of steps.
        #       There is also the argument that bisect_right is a compiled
        #       function, so there is even more to be gained.
        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1: # Bisect will always be faster for M = 1
            return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        #     offsets = {}
        #     for key in in_keys:
        #         offsets.setdefault(bisect_right(fixed_keys, key),
        #                            []).append(key)
        #     return [(o, offsets[o]) for o in sorted(offsets)]
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = in_keys_iter.next()
        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()

        class InputDone(Exception): pass
        class FixedDone(Exception): pass

        output = []
        cur_out = []

        # TODO: Another possibility is that rather than iterating on each side,
        #       we could use a combination of bisecting and iterating. For
        #       example, while cur_in_key < fixed_key, bisect to find its
        #       point, then iterate all matching keys, then bisect (restricted
        #       to only the remainder) for the next one, etc.
        try:
            while True:
                if cur_in_key < cur_fixed_key:
                    cur_keys = []
                    cur_out = (cur_fixed_offset, cur_keys)
                    output.append(cur_out)
                    while cur_in_key < cur_fixed_key:
                        cur_keys.append(cur_in_key)
                        try:
                            cur_in_key = in_keys_iter.next()
                        except StopIteration:
                            raise InputDone
                # At this point cur_in_key must be >= cur_fixed_key
                # step the cur_fixed_key until we pass the cur key, or walk off
                # the end
                while cur_in_key >= cur_fixed_key:
                    try:
                        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
                    except StopIteration:
                        raise FixedDone
        except InputDone:
            # We consumed all of the input, nothing more to do
            pass
        except FixedDone:
            # There was some input left, but we consumed all of fixed, so we
            # have to add one more for the tail
            cur_keys = [cur_in_key]
            cur_keys.extend(in_keys_iter)
            cur_out = (len(fixed_keys), cur_keys)
            output.append(cur_out)
        return output
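
    # Worked example (hypothetical data): bisecting in_keys [1, 4, 9] into
    # fixed_keys [3, 7] yields [(0, [1]), (1, [4]), (2, [9])] -- key 1
    # belongs before fixed key 3 (child 0), 4 falls between 3 and 7
    # (child 1), and 9 falls after 7 (child 2).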

    def _walk_through_internal_nodes(self, keys):
        """Take the given set of keys, and find the corresponding LeafNodes.

        :param keys: An unsorted iterable of keys to search for
        :return: (nodes, keys_at_index)
            nodes is a dict mapping {index: LeafNode}
            keys_at_index is a list of tuples of [(index, [keys for Leaf])]
        """
        # 6 seconds spent in miss_torture using the sorted() line.
        # Even with out of order disk IO it seems faster not to sort it when
        # large queries are being made.
        keys_at_index = [(0, sorted(keys))]

        for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
            node_indexes = [idx for idx, s_keys in keys_at_index]
            nodes = self._get_internal_nodes(node_indexes)

            next_nodes_and_keys = []
            for node_index, sub_keys in keys_at_index:
                node = nodes[node_index]
                positions = self._multi_bisect_right(sub_keys, node.keys)
                node_offset = next_row_start + node.offset
                next_nodes_and_keys.extend([(node_offset + pos, s_keys)
                                            for pos, s_keys in positions])
            keys_at_index = next_nodes_and_keys
        # We should now be at the _LeafNodes
        node_indexes = [idx for idx, s_keys in keys_at_index]

        # TODO: We may *not* want to always read all the nodes in one
        #       big go. Consider setting a max size on this.
        nodes = self._get_leaf_nodes(node_indexes)
        return nodes, keys_at_index

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys supplied. No additional keys will be returned, and every
            key supplied that is in the index will be returned.
        """
        # 6 seconds spent in miss_torture using the sorted() line.
        # Even with out of order disk IO it seems faster not to sort it when
        # large queries are being made.
        # However, now that we are doing multi-way bisecting, we need the keys
        # in sorted order anyway. We could change the multi-way code to not
        # require sorted order. (For example, it bisects for the first node,
        # does an in-order search until a key comes before the current point,
        # which it then bisects for, etc.)
        keys = frozenset(keys)
        if not keys:
            return

        if not self.key_count():
            return

        needed_keys = []
        if self._leaf_value_cache is None:
            needed_keys = keys
        else:
            for key in keys:
                value = self._leaf_value_cache.get(key, None)
                if value is not None:
                    # This key is cached; yield it without touching the btree
                    value, refs = value
                    if self.node_ref_lists:
                        yield (self, key, value, refs)
                    else:
                        yield (self, key, value)
                else:
                    needed_keys.append(key)

        if not needed_keys:
            return
        nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
        for node_index, sub_keys in nodes_and_keys:
            if not sub_keys:
                continue
            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node:
                    value, refs = node[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)
                    else:
                        yield (self, next_sub_key, value)

    def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
        """Find the parent_map information for the set of keys.

        This populates the parent_map dict and missing_keys set based on the
        queried keys. It also can fill out an arbitrary number of parents that
        it finds while searching for the supplied keys.

        It is unlikely that you want to call this directly. See
        "CombinedGraphIndex.find_ancestry()" for a more appropriate API.

        :param keys: The keys whose ancestry we want to return.
            Every key will either end up in 'parent_map' or 'missing_keys'.
        :param ref_list_num: This index in the ref_lists is the parents we
            care about.
        :param parent_map: {key: parent_keys} for keys that are present in this
            index. This may contain more entries than were in 'keys', that are
            reachable ancestors of the keys requested.
        :param missing_keys: keys which are known to be missing in this index.
            This may include parents that were not directly requested, but we
            were able to determine that they are not present in this index.
        :return: search_keys    parents that were found but not queried to know
            if they are missing or present. Callers can re-query this index for
            those keys, and they will be placed into parent_map or missing_keys
        """
        if not self.key_count():
            # We use key_count() to trigger reading the root node and
            # determining info about this BTreeGraphIndex
            # If we don't have any keys, then everything is missing
            missing_keys.update(keys)
            return set()
        if ref_list_num >= self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                % (ref_list_num, self.node_ref_lists))

        # The main trick we are trying to accomplish is that when we find a
        # key listing its parents, we expect that the parent key is also likely
        # to sit on the same page. Allowing us to expand parents quickly
        # without suffering the full stack of bisecting, etc.
        nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)

        # These are parent keys which could not be immediately resolved on the
        # page where the child was present. Note that we may already be
        # searching for that key, and it may actually be present [or known
        # missing] on one of the other pages we are reading.
        # TODO:
        #   We could try searching for them in the immediate previous or next
        #   page. If they occur "later" we could put them in a pending lookup
        #   set, and then for each node we read thereafter we could check to
        #   see if they are present.
        #   However, we don't know the impact of keeping this list of things
        #   that I'm going to search for every node I come across from here on
        #   out.
        #   It doesn't handle the case when the parent key is missing on a
        #   page that we *don't* read. So we already have to handle being
        #   re-entrant for that.
        #   Since most keys contain a date string, they are more likely to be
        #   found earlier in the file than later, but we would know that right
        #   away (key < min_key), and wouldn't keep searching it on every other
        #   page that we read.
        #   Mostly, it is an idea, one which should be benchmarked.
        parents_not_on_page = set()

        for node_index, sub_keys in nodes_and_keys:
            if not sub_keys:
                continue
            # sub_keys is all of the keys we are looking for that should exist
            # on this page, if they aren't here, then they won't be found
            node = nodes[node_index]
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)
            # Don't look for things we've already found
            parents_to_check = parents_to_check.difference(parent_map)
            # this can be used to test the benefit of having the check loop
            # inlined.
            # parents_not_on_page.update(parents_to_check)
            # continue
            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node:
                        value, refs = node[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)
                    else:
                        # This parent either is genuinely missing, or should be
                        # found on another page. Perf test whether it is better
                        # to check if this node should fit on this page or not.
                        # in the 'everything-in-one-pack' scenario, this *not*
                        # doing the check is 237ms vs 243ms.
                        # So slightly better, but I assume the standard 'lots
                        # of packs' is going to show a reasonable improvement
                        # from the check, because it avoids 'going around
                        # again' for everything that is in another index
                        # parents_not_on_page.add(key)
                        # Missing for some reason
                        if key < node.min_key:
                            # in the case of bzr.dev, 3.4k/5.3k misses are
                            # 'earlier' misses (65%)
                            parents_not_on_page.add(key)
                        elif key > node.max_key:
                            # This parent key would be present on a different
                            # LeafNode
                            parents_not_on_page.add(key)
                        else:
                            # assert key != node.min_key and key != node.max_key
                            # If it was going to be present, it would be on
                            # *this* page, so mark it missing.
                            missing_keys.add(key)
                parents_to_check = next_parents_to_check.difference(parent_map)
                # Might want to do another .difference() from missing_keys
        # parents_not_on_page could have been found on a different page, or be
        # known to be missing. So cull out everything that has already been
        # found.
        search_keys = parents_not_on_page.difference(
            parent_map).difference(missing_keys)
        return search_keys

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        WARNING: Note that this method currently causes a full index parse
        unconditionally (which is reasonably appropriate as it is a means for
        thunking many small indices into one larger one and still supplies
        iter_all_entries at the thunk layer).

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = sorted(set(keys))
        if not keys:
            return
        # Load if needed to check key lengths
        if self._key_count is None:
            self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current bzrlib) just suck the entire index and iterate in memory.
        nodes = {}
        nodes_by_key = None
        if self.node_ref_lists:
            if self._key_length == 1:
                for _1, key, value, refs in self.iter_all_entries():
                    nodes[key] = value, refs
            else:
                nodes_by_key = {}
                for _1, key, value, refs in self.iter_all_entries():
                    key_value = key, value, refs
                    # For a key of (foo, bar, baz) create
                    # _nodes_by_key[foo][bar][baz] = key_value
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key_value
        else:
            if self._key_length == 1:
                for _1, key, value in self.iter_all_entries():
                    nodes[key] = value
            else:
                nodes_by_key = {}
                for _1, key, value in self.iter_all_entries():
                    key_value = key, value
                    # For a key of (foo, bar, baz) create
                    # _nodes_by_key[foo][bar][baz] = key_value
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key_value
        if self._key_length == 1:
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]
                        yield self, key, value, node_refs
                    else:
                        yield self, key, nodes[key]
                except KeyError:
                    pass
            return
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = nodes_by_key
            elements = list(key)
            # find the subdict whose contents should be returned.
            try:
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements.pop(0)
            except KeyError:
                # a non-existent lookup.
                continue
            if len(elements):
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    item, value = key_dict.iteritems().next()
                    if type(value) == dict:
                        # push all my children
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            # each value is the key:value:node refs tuple
                            # ready to yield
                            yield (self, ) + value
            else:
                # the last thing looked up was a terminal element
                yield (self, ) + key_dict

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For BTreeGraphIndex the estimate is exact as it is contained in the
        header.
        """
        if self._key_count is None:
            self._get_root_node()
        return self._key_count

    def _compute_row_offsets(self):
        """Fill out the _row_offsets attribute based on _row_lengths."""
        offsets = []
        row_offset = 0
        for row in self._row_lengths:
            offsets.append(row_offset)
            row_offset += row
        offsets.append(row_offset)
        self._row_offsets = offsets
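
    # Worked example: _row_lengths == [1, 4, 63] gives
    # _row_offsets == [0, 1, 5, 68]; row i occupies pages
    # [_row_offsets[i], _row_offsets[i+1]) and _row_offsets[-1] is the total
    # page count.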

    def _parse_header_from_bytes(self, bytes):
        """Parse the header from a region of bytes.

        :param bytes: The data to parse.
        :return: An offset, data tuple such as readv yields, for the unparsed
            data. (which may be of length 0).
        """
        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise errors.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise errors.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise errors.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = lines[3]
        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise errors.BadIndexOptions(self)
        try:
            self._row_lengths = map(int, [length for length in
                options_line[len(_OPTION_ROW_LENGTHS):].split(',')
                if len(length)])
        except ValueError:
            raise errors.BadIndexOptions(self)
        self._compute_row_offsets()

        # calculate the bytes we have processed
        header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
        return header_end, bytes[header_end:]
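
    # Header illustration (values invented for illustration): the start of an
    # index whose root row holds 1 node and whose second row holds 3 nodes
    # would look like
    #
    #   "B+Tree Graph Index 2\nnode_ref_lists=1\nkey_elements=2\n"
    #   "len=120\nrow_lengths=1,3\n"
    #
    # followed immediately by the zlib-compressed root node; the first page
    # is then zero-padded up to a full _PAGE_SIZE.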

    def _read_nodes(self, nodes):
        """Read some nodes from disk into the LRU cache.

        This performs a readv to get the node data into memory, and parses each
        node, then yields it to the caller. The nodes are requested in the
        supplied order. If possible, doing a sort() on the list before
        requesting a read may improve performance.

        :param nodes: The nodes to read. 0 - first node, 1 - second node etc.
        """
        # may be the byte string of the whole file
        bytes = None
        # list of (offset, length) regions of the file that should, eventually,
        # be read in to data_ranges, either from 'bytes' or from the transport
        ranges = []
        base_offset = self._base_offset
        for index in nodes:
            offset = (index * _PAGE_SIZE)
            size = _PAGE_SIZE
            if index == 0:
                # Root node - special case
                if self._size:
                    size = min(_PAGE_SIZE, self._size)
                else:
                    # The only case where we don't know the size, is for very
                    # small indexes. So we read the whole thing
                    bytes = self._transport.get_bytes(self._name)
                    num_bytes = len(bytes)
                    self._size = num_bytes - base_offset
                    # the whole thing should be parsed out of 'bytes'
                    ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                        for start in xrange(base_offset, num_bytes, _PAGE_SIZE)]
                    break
            else:
                if offset > self._size:
                    raise AssertionError('tried to read past the end'
                                         ' of the file %s > %s'
                                         % (offset, self._size))
                size = min(size, self._size - offset)
            ranges.append((base_offset + offset, size))
        if not ranges:
            return
        elif bytes is not None:
            # already have the whole file
            data_ranges = [(start, bytes[start:start+size])
                           for start, size in ranges]
        elif self._file is None:
            data_ranges = self._transport.readv(self._name, ranges)
        else:
            data_ranges = []
            for offset, size in ranges:
                self._file.seek(offset)
                data_ranges.append((offset, self._file.read(size)))
        for offset, data in data_ranges:
            offset -= base_offset
            if offset == 0:
                # extract the header
                offset, data = self._parse_header_from_bytes(data)
                if len(data) == 0:
                    continue
            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = self._leaf_factory(bytes, self._key_length,
                                          self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
            else:
                raise AssertionError("Unknown node type for %r" % bytes)
            yield offset / _PAGE_SIZE, node

    def _signature(self):
        """The file signature for this index type."""
        return _BTSIGNATURE

    def validate(self):
        """Validate that everything in the index can be accessed."""
        # just read and parse every node.
        self._get_root_node()
        if len(self._row_lengths) > 1:
            start_node = self._row_offsets[1]
        else:
            # We shouldn't be reading anything anyway
            start_node = 1
        node_end = self._row_offsets[-1]
        for node in self._read_nodes(range(start_node, node_end)):
            pass


_gcchk_factory = _LeafNode

try:
    from bzrlib import _btree_serializer_pyx as _btree_serializer
    _gcchk_factory = _btree_serializer._parse_into_chk
except ImportError, e:
    osutils.failed_to_load_extension(e)
    from bzrlib import _btree_serializer_py as _btree_serializer