1
# Copyright (C) 2008-2011 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
from bzrlib.lazy_import import lazy_import
23
lazy_import(globals(), """
42
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
45
_BTSIGNATURE = "B+Tree Graph Index 2\n"
46
_OPTION_ROW_LENGTHS = "row_lengths="
47
_LEAF_FLAG = "type=leaf\n"
48
_INTERNAL_FLAG = "type=internal\n"
49
_INTERNAL_OFFSET = "offset="
51
_RESERVED_HEADER_BYTES = 120
54
# 4K per page: 4MB - 1000 entries
55
_NODE_CACHE_SIZE = 1000
58
class _BuilderRow(object):
59
"""The stored state accumulated while writing out a row in the index.
61
:ivar spool: A temporary file used to accumulate nodes for this row
63
:ivar nodes: The count of nodes emitted so far.
67
"""Create a _BuilderRow."""
69
self.spool = None# tempfile.TemporaryFile(prefix='bzr-index-row-')
72
def finish_node(self, pad=True):
73
byte_lines, _, padding = self.writer.finish()
75
self.spool = cStringIO.StringIO()
77
self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
79
# We got bigger than 1 node, switch to a temp file
80
spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
81
spool.write(self.spool.getvalue())
84
if not pad and padding:
86
skipped_bytes = padding
87
self.spool.writelines(byte_lines)
88
remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
90
raise AssertionError("incorrect node length: %d, %d"
91
% (self.spool.tell(), remainder))
96
class _InternalBuilderRow(_BuilderRow):
97
"""The stored state accumulated while writing out internal rows."""
99
def finish_node(self, pad=True):
101
raise AssertionError("Must pad internal nodes only.")
102
_BuilderRow.finish_node(self)
105
class _LeafBuilderRow(_BuilderRow):
106
"""The stored state accumulated while writing out a leaf rows."""
109
class BTreeBuilder(index.GraphIndexBuilder):
110
"""A Builder for B+Tree based Graph indices.
112
The resulting graph has the structure:
114
_SIGNATURE OPTIONS NODES
115
_SIGNATURE := 'B+Tree Graph Index 1' NEWLINE
116
OPTIONS := REF_LISTS KEY_ELEMENTS LENGTH
117
REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
118
KEY_ELEMENTS := 'key_elements=' DIGITS NEWLINE
119
LENGTH := 'len=' DIGITS NEWLINE
120
ROW_LENGTHS := 'row_lengths' DIGITS (COMMA DIGITS)*
121
NODES := NODE_COMPRESSED*
122
NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
123
NODE_RAW := INTERNAL | LEAF
124
INTERNAL := INTERNAL_FLAG POINTERS
125
LEAF := LEAF_FLAG ROWS
126
KEY_ELEMENT := Not-whitespace-utf8
127
KEY := KEY_ELEMENT (NULL KEY_ELEMENT)*
129
ROW := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
131
REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
132
REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
134
VALUE := no-newline-no-null-bytes
137
def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
138
"""See GraphIndexBuilder.__init__.
140
:param spill_at: Optional parameter controlling the maximum number
141
of nodes that BTreeBuilder will hold in memory.
143
index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
144
key_elements=key_elements)
145
self._spill_at = spill_at
146
self._backing_indices = []
147
# A map of {key: (node_refs, value)}
149
# Indicate it hasn't been built yet
150
self._nodes_by_key = None
151
self._optimize_for_size = False
153
def add_node(self, key, value, references=()):
154
"""Add a node to the index.
156
If adding the node causes the builder to reach its spill_at threshold,
157
disk spilling will be triggered.
159
:param key: The key. keys are non-empty tuples containing
160
as many whitespace-free utf8 bytestrings as the key length
161
defined for this index.
162
:param references: An iterable of iterables of keys. Each is a
163
reference to another key.
164
:param value: The value to associate with the key. It may be any
165
bytes as long as it does not contain \0 or \n.
167
# Ensure that 'key' is a StaticTuple
168
key = static_tuple.StaticTuple.from_sequence(key).intern()
169
# we don't care about absent_references
170
node_refs, _ = self._check_key_ref_value(key, references, value)
171
if key in self._nodes:
172
raise errors.BadIndexDuplicateKey(key, self)
173
self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
174
if self._nodes_by_key is not None and self._key_length > 1:
175
self._update_nodes_by_key(key, value, node_refs)
176
if len(self._nodes) < self._spill_at:
178
self._spill_mem_keys_to_disk()
180
def _spill_mem_keys_to_disk(self):
181
"""Write the in memory keys down to disk to cap memory consumption.
183
If we already have some keys written to disk, we will combine them so
184
as to preserve the sorted order. The algorithm for combining uses
185
powers of two. So on the first spill, write all mem nodes into a
186
single index. On the second spill, combine the mem nodes with the nodes
187
on disk to create a 2x sized disk index and get rid of the first index.
188
On the third spill, create a single new disk index, which will contain
189
the mem nodes, and preserve the existing 2x sized index. On the fourth,
190
combine mem with the first and second indexes, creating a new one of
191
size 4x. On the fifth create a single new one, etc.
193
if self._combine_backing_indices:
194
(new_backing_file, size,
195
backing_pos) = self._spill_mem_keys_and_combine()
197
new_backing_file, size = self._spill_mem_keys_without_combining()
198
# Note: The transport here isn't strictly needed, because we will use
199
# direct access to the new_backing._file object
200
new_backing = BTreeGraphIndex(transport.get_transport('.'),
202
# GC will clean up the file
203
new_backing._file = new_backing_file
204
if self._combine_backing_indices:
205
if len(self._backing_indices) == backing_pos:
206
self._backing_indices.append(None)
207
self._backing_indices[backing_pos] = new_backing
208
for backing_pos in range(backing_pos):
209
self._backing_indices[backing_pos] = None
211
self._backing_indices.append(new_backing)
213
self._nodes_by_key = None
215
def _spill_mem_keys_without_combining(self):
216
return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
218
def _spill_mem_keys_and_combine(self):
219
iterators_to_combine = [self._iter_mem_nodes()]
221
for pos, backing in enumerate(self._backing_indices):
225
iterators_to_combine.append(backing.iter_all_entries())
226
backing_pos = pos + 1
227
new_backing_file, size = \
228
self._write_nodes(self._iter_smallest(iterators_to_combine),
229
allow_optimize=False)
230
return new_backing_file, size, backing_pos
232
def add_nodes(self, nodes):
233
"""Add nodes to the index.
235
:param nodes: An iterable of (key, node_refs, value) entries to add.
237
if self.reference_lists:
238
for (key, value, node_refs) in nodes:
239
self.add_node(key, value, node_refs)
241
for (key, value) in nodes:
242
self.add_node(key, value)
244
def _iter_mem_nodes(self):
245
"""Iterate over the nodes held in memory."""
247
if self.reference_lists:
248
for key in sorted(nodes):
249
references, value = nodes[key]
250
yield self, key, value, references
252
for key in sorted(nodes):
253
references, value = nodes[key]
254
yield self, key, value
256
def _iter_smallest(self, iterators_to_combine):
257
if len(iterators_to_combine) == 1:
258
for value in iterators_to_combine[0]:
262
for iterator in iterators_to_combine:
264
current_values.append(iterator.next())
265
except StopIteration:
266
current_values.append(None)
269
# Decorate candidates with the value to allow 2.4's min to be used.
270
candidates = [(item[1][1], item) for item
271
in enumerate(current_values) if item[1] is not None]
272
if not len(candidates):
274
selected = min(candidates)
275
# undecorate back to (pos, node)
276
selected = selected[1]
277
if last == selected[1][1]:
278
raise errors.BadIndexDuplicateKey(last, self)
279
last = selected[1][1]
280
# Yield, with self as the index
281
yield (self,) + selected[1][1:]
284
current_values[pos] = iterators_to_combine[pos].next()
285
except StopIteration:
286
current_values[pos] = None
288
def _add_key(self, string_key, line, rows, allow_optimize=True):
289
"""Add a key to the current chunk.
291
:param string_key: The key to add.
292
:param line: The fully serialised key and value.
293
:param allow_optimize: If set to False, prevent setting the optimize
294
flag when writing out. This is used by the _spill_mem_keys_to_disk
297
if rows[-1].writer is None:
298
# opening a new leaf chunk;
299
for pos, internal_row in enumerate(rows[:-1]):
300
# flesh out any internal nodes that are needed to
301
# preserve the height of the tree
302
if internal_row.writer is None:
304
if internal_row.nodes == 0:
305
length -= _RESERVED_HEADER_BYTES # padded
307
optimize_for_size = self._optimize_for_size
309
optimize_for_size = False
310
internal_row.writer = chunk_writer.ChunkWriter(length, 0,
311
optimize_for_size=optimize_for_size)
312
internal_row.writer.write(_INTERNAL_FLAG)
313
internal_row.writer.write(_INTERNAL_OFFSET +
314
str(rows[pos + 1].nodes) + "\n")
317
if rows[-1].nodes == 0:
318
length -= _RESERVED_HEADER_BYTES # padded
319
rows[-1].writer = chunk_writer.ChunkWriter(length,
320
optimize_for_size=self._optimize_for_size)
321
rows[-1].writer.write(_LEAF_FLAG)
322
if rows[-1].writer.write(line):
323
# this key did not fit in the node:
324
rows[-1].finish_node()
325
key_line = string_key + "\n"
327
for row in reversed(rows[:-1]):
328
# Mark the start of the next node in the node above. If it
329
# doesn't fit then propagate upwards until we find one that
331
if row.writer.write(key_line):
334
# We've found a node that can handle the pointer.
337
# If we reached the current root without being able to mark the
338
# division point, then we need a new root:
341
if 'index' in debug.debug_flags:
342
trace.mutter('Inserting new global row.')
343
new_row = _InternalBuilderRow()
345
rows.insert(0, new_row)
346
# This will be padded, hence the -100
347
new_row.writer = chunk_writer.ChunkWriter(
348
_PAGE_SIZE - _RESERVED_HEADER_BYTES,
350
optimize_for_size=self._optimize_for_size)
351
new_row.writer.write(_INTERNAL_FLAG)
352
new_row.writer.write(_INTERNAL_OFFSET +
353
str(rows[1].nodes - 1) + "\n")
354
new_row.writer.write(key_line)
355
self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
357
def _write_nodes(self, node_iterator, allow_optimize=True):
358
"""Write node_iterator out as a B+Tree.
360
:param node_iterator: An iterator of sorted nodes. Each node should
361
match the output given by iter_all_entries.
362
:param allow_optimize: If set to False, prevent setting the optimize
363
flag when writing out. This is used by the _spill_mem_keys_to_disk
365
:return: A file handle for a temporary file containing a B+Tree for
368
# The index rows - rows[0] is the root, rows[1] is the layer under it
371
# forward sorted by key. In future we may consider topological sorting,
372
# at the cost of table scans for direct lookup, or a second index for
375
# A stack with the number of nodes of each size. 0 is the root node
376
# and must always be 1 (if there are any nodes in the tree).
377
self.row_lengths = []
378
# Loop over all nodes adding them to the bottom row
379
# (rows[-1]). When we finish a chunk in a row,
380
# propagate the key that didn't fit (comes after the chunk) to the
381
# row above, transitively.
382
for node in node_iterator:
384
# First key triggers the first row
385
rows.append(_LeafBuilderRow())
387
string_key, line = _btree_serializer._flatten_node(node,
388
self.reference_lists)
389
self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
390
for row in reversed(rows):
391
pad = (type(row) != _LeafBuilderRow)
392
row.finish_node(pad=pad)
393
lines = [_BTSIGNATURE]
394
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
395
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
396
lines.append(_OPTION_LEN + str(key_count) + '\n')
397
row_lengths = [row.nodes for row in rows]
398
lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
399
if row_lengths and row_lengths[-1] > 1:
400
result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
402
result = cStringIO.StringIO()
403
result.writelines(lines)
404
position = sum(map(len, lines))
406
if position > _RESERVED_HEADER_BYTES:
407
raise AssertionError("Could not fit the header in the"
408
" reserved space: %d > %d"
409
% (position, _RESERVED_HEADER_BYTES))
410
# write the rows out:
412
reserved = _RESERVED_HEADER_BYTES # reserved space for first node
415
# copy nodes to the finalised file.
416
# Special case the first node as it may be prefixed
417
node = row.spool.read(_PAGE_SIZE)
418
result.write(node[reserved:])
419
if len(node) == _PAGE_SIZE:
420
result.write("\x00" * (reserved - position))
421
position = 0 # Only the root row actually has an offset
422
copied_len = osutils.pumpfile(row.spool, result)
423
if copied_len != (row.nodes - 1) * _PAGE_SIZE:
424
if type(row) != _LeafBuilderRow:
425
raise AssertionError("Incorrect amount of data copied"
426
" expected: %d, got: %d"
427
% ((row.nodes - 1) * _PAGE_SIZE,
435
"""Finalise the index.
437
:return: A file handle for a temporary file containing the nodes added
440
return self._write_nodes(self.iter_all_entries())[0]
442
def iter_all_entries(self):
443
"""Iterate over all keys within the index
445
:return: An iterable of (index, key, value, reference_lists). There is
446
no defined order for the result iteration - it will be in the most
447
efficient order for the index (in this case dictionary hash order).
449
if 'evil' in debug.debug_flags:
450
trace.mutter_callsite(3,
451
"iter_all_entries scales with size of history.")
452
# Doing serial rather than ordered would be faster; but this shouldn't
453
# be getting called routinely anyway.
454
iterators = [self._iter_mem_nodes()]
455
for backing in self._backing_indices:
456
if backing is not None:
457
iterators.append(backing.iter_all_entries())
458
if len(iterators) == 1:
460
return self._iter_smallest(iterators)
462
def iter_entries(self, keys):
463
"""Iterate over keys within the index.
465
:param keys: An iterable providing the keys to be retrieved.
466
:return: An iterable of (index, key, value, reference_lists). There is no
467
defined order for the result iteration - it will be in the most
468
efficient order for the index (keys iteration order in this case).
471
# Note: We don't use keys.intersection() here. If you read the C api,
472
# set.intersection(other) special cases when other is a set and
473
# will iterate the smaller of the two and lookup in the other.
474
# It does *not* do this for any other type (even dict, unlike
475
# some other set functions.) Since we expect keys is generally <<
476
# self._nodes, it is faster to iterate over it in a list
479
local_keys = [key for key in keys if key in nodes]
480
if self.reference_lists:
481
for key in local_keys:
483
yield self, key, node[1], node[0]
485
for key in local_keys:
487
yield self, key, node[1]
488
# Find things that are in backing indices that have not been handled
490
if not self._backing_indices:
491
return # We won't find anything there either
492
# Remove all of the keys that we found locally
493
keys.difference_update(local_keys)
494
for backing in self._backing_indices:
499
for node in backing.iter_entries(keys):
501
yield (self,) + node[1:]
503
def iter_entries_prefix(self, keys):
504
"""Iterate over keys within the index using prefix matching.
506
Prefix matching is applied within the tuple of a key, not to within
507
the bytestring of each key element. e.g. if you have the keys ('foo',
508
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
509
only the former key is returned.
511
:param keys: An iterable providing the key prefixes to be retrieved.
512
Each key prefix takes the form of a tuple the length of a key, but
513
with the last N elements 'None' rather than a regular bytestring.
514
The first element cannot be 'None'.
515
:return: An iterable as per iter_all_entries, but restricted to the
516
keys with a matching prefix to those supplied. No additional keys
517
will be returned, and every match that is in the index will be
520
# XXX: To much duplication with the GraphIndex class; consider finding
521
# a good place to pull out the actual common logic.
525
for backing in self._backing_indices:
528
for node in backing.iter_entries_prefix(keys):
529
yield (self,) + node[1:]
530
if self._key_length == 1:
534
raise errors.BadIndexKey(key)
535
if len(key) != self._key_length:
536
raise errors.BadIndexKey(key)
538
node = self._nodes[key]
541
if self.reference_lists:
542
yield self, key, node[1], node[0]
544
yield self, key, node[1]
549
raise errors.BadIndexKey(key)
550
if len(key) != self._key_length:
551
raise errors.BadIndexKey(key)
552
# find what it refers to:
553
key_dict = self._get_nodes_by_key()
555
# find the subdict to return
557
while len(elements) and elements[0] is not None:
558
key_dict = key_dict[elements[0]]
561
# a non-existant lookup.
566
key_dict = dicts.pop(-1)
567
# can't be empty or would not exist
568
item, value = key_dict.iteritems().next()
569
if type(value) == dict:
571
dicts.extend(key_dict.itervalues())
574
for value in key_dict.itervalues():
575
yield (self, ) + tuple(value)
577
yield (self, ) + key_dict
579
def _get_nodes_by_key(self):
580
if self._nodes_by_key is None:
582
if self.reference_lists:
583
for key, (references, value) in self._nodes.iteritems():
584
key_dict = nodes_by_key
585
for subkey in key[:-1]:
586
key_dict = key_dict.setdefault(subkey, {})
587
key_dict[key[-1]] = key, value, references
589
for key, (references, value) in self._nodes.iteritems():
590
key_dict = nodes_by_key
591
for subkey in key[:-1]:
592
key_dict = key_dict.setdefault(subkey, {})
593
key_dict[key[-1]] = key, value
594
self._nodes_by_key = nodes_by_key
595
return self._nodes_by_key
598
"""Return an estimate of the number of keys in this index.
600
For InMemoryGraphIndex the estimate is exact.
602
return len(self._nodes) + sum(backing.key_count() for backing in
603
self._backing_indices if backing is not None)
606
"""In memory index's have no known corruption at the moment."""
609
class _LeafNode(dict):
610
"""A leaf node for a serialised B+Tree index."""
612
__slots__ = ('min_key', 'max_key', '_keys')
614
def __init__(self, bytes, key_length, ref_list_length):
615
"""Parse bytes to create a leaf node object."""
616
# splitlines mangles the \r delimiters.. don't use it.
617
key_list = _btree_serializer._parse_leaf_lines(bytes,
618
key_length, ref_list_length)
620
self.min_key = key_list[0][0]
621
self.max_key = key_list[-1][0]
623
self.min_key = self.max_key = None
624
super(_LeafNode, self).__init__(key_list)
625
self._keys = dict(self)
628
"""Return a sorted list of (key, (value, refs)) items"""
634
"""Return a sorted list of all keys."""
640
class _InternalNode(object):
641
"""An internal node for a serialised B+Tree index."""
643
__slots__ = ('keys', 'offset')
645
def __init__(self, bytes):
646
"""Parse bytes to create an internal node object."""
647
# splitlines mangles the \r delimiters.. don't use it.
648
self.keys = self._parse_lines(bytes.split('\n'))
650
def _parse_lines(self, lines):
652
self.offset = int(lines[1][7:])
653
as_st = static_tuple.StaticTuple.from_sequence
654
for line in lines[2:]:
657
nodes.append(as_st(map(intern, line.split('\0'))).intern())
661
class BTreeGraphIndex(object):
662
"""Access to nodes via the standard GraphIndex interface for B+Tree's.
664
Individual nodes are held in a LRU cache. This holds the root node in
665
memory except when very large walks are done.
668
def __init__(self, transport, name, size, unlimited_cache=False,
670
"""Create a B+Tree index object on the index name.
672
:param transport: The transport to read data for the index from.
673
:param name: The file name of the index on transport.
674
:param size: Optional size of the index in bytes. This allows
675
compatibility with the GraphIndex API, as well as ensuring that
676
the initial read (to read the root node header) can be done
677
without over-reading even on empty indices, and on small indices
678
allows single-IO to read the entire index.
679
:param unlimited_cache: If set to True, then instead of using an
680
LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
681
cache all leaf nodes.
682
:param offset: The start of the btree index data isn't byte 0 of the
683
file. Instead it starts at some point later.
685
self._transport = transport
689
self._recommended_pages = self._compute_recommended_pages()
690
self._root_node = None
691
self._base_offset = offset
692
self._leaf_factory = _LeafNode
693
# Default max size is 100,000 leave values
694
self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
696
self._leaf_node_cache = {}
697
self._internal_node_cache = {}
699
self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
700
# We use a FIFO here just to prevent possible blowout. However, a
701
# 300k record btree has only 3k leaf nodes, and only 20 internal
702
# nodes. A value of 100 scales to ~100*100*100 = 1M records.
703
self._internal_node_cache = fifo_cache.FIFOCache(100)
704
self._key_count = None
705
self._row_lengths = None
706
self._row_offsets = None # Start of each row, [-1] is the end
708
def __eq__(self, other):
709
"""Equal when self and other were created with the same parameters."""
711
type(self) == type(other) and
712
self._transport == other._transport and
713
self._name == other._name and
714
self._size == other._size)
716
def __ne__(self, other):
717
return not self.__eq__(other)
719
def _get_and_cache_nodes(self, nodes):
720
"""Read nodes and cache them in the lru.
722
The nodes list supplied is sorted and then read from disk, each node
723
being inserted it into the _node_cache.
725
Note: Asking for more nodes than the _node_cache can contain will
726
result in some of the results being immediately discarded, to prevent
727
this an assertion is raised if more nodes are asked for than are
730
:return: A dict of {node_pos: node}
733
start_of_leaves = None
734
for node_pos, node in self._read_nodes(sorted(nodes)):
735
if node_pos == 0: # Special case
736
self._root_node = node
738
if start_of_leaves is None:
739
start_of_leaves = self._row_offsets[-2]
740
if node_pos < start_of_leaves:
741
self._internal_node_cache[node_pos] = node
743
self._leaf_node_cache[node_pos] = node
744
found[node_pos] = node
747
def _compute_recommended_pages(self):
748
"""Convert transport's recommended_page_size into btree pages.
750
recommended_page_size is in bytes, we want to know how many _PAGE_SIZE
751
pages fit in that length.
753
recommended_read = self._transport.recommended_page_size()
754
recommended_pages = int(math.ceil(recommended_read /
756
return recommended_pages
758
def _compute_total_pages_in_index(self):
759
"""How many pages are in the index.
761
If we have read the header we will use the value stored there.
762
Otherwise it will be computed based on the length of the index.
764
if self._size is None:
765
raise AssertionError('_compute_total_pages_in_index should not be'
766
' called when self._size is None')
767
if self._root_node is not None:
768
# This is the number of pages as defined by the header
769
return self._row_offsets[-1]
770
# This is the number of pages as defined by the size of the index. They
771
# should be indentical.
772
total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
775
def _expand_offsets(self, offsets):
776
"""Find extra pages to download.
778
The idea is that we always want to make big-enough requests (like 64kB
779
for http), so that we don't waste round trips. So given the entries
780
that we already have cached and the new pages being downloaded figure
781
out what other pages we might want to read.
783
See also doc/developers/btree_index_prefetch.txt for more details.
785
:param offsets: The offsets to be read
786
:return: A list of offsets to download
788
if 'index' in debug.debug_flags:
789
trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)
791
if len(offsets) >= self._recommended_pages:
792
# Don't add more, we are already requesting more than enough
793
if 'index' in debug.debug_flags:
794
trace.mutter(' not expanding large request (%s >= %s)',
795
len(offsets), self._recommended_pages)
797
if self._size is None:
798
# Don't try anything, because we don't know where the file ends
799
if 'index' in debug.debug_flags:
800
trace.mutter(' not expanding without knowing index size')
802
total_pages = self._compute_total_pages_in_index()
803
cached_offsets = self._get_offsets_to_cached_pages()
804
# If reading recommended_pages would read the rest of the index, just
806
if total_pages - len(cached_offsets) <= self._recommended_pages:
807
# Read whatever is left
809
expanded = [x for x in xrange(total_pages)
810
if x not in cached_offsets]
812
expanded = range(total_pages)
813
if 'index' in debug.debug_flags:
814
trace.mutter(' reading all unread pages: %s', expanded)
817
if self._root_node is None:
818
# ATM on the first read of the root node of a large index, we don't
819
# bother pre-reading any other pages. This is because the
820
# likelyhood of actually reading interesting pages is very low.
821
# See doc/developers/btree_index_prefetch.txt for a discussion, and
822
# a possible implementation when we are guessing that the second
823
# layer index is small
824
final_offsets = offsets
826
tree_depth = len(self._row_lengths)
827
if len(cached_offsets) < tree_depth and len(offsets) == 1:
828
# We haven't read enough to justify expansion
829
# If we are only going to read the root node, and 1 leaf node,
830
# then it isn't worth expanding our request. Once we've read at
831
# least 2 nodes, then we are probably doing a search, and we
832
# start expanding our requests.
833
if 'index' in debug.debug_flags:
834
trace.mutter(' not expanding on first reads')
836
final_offsets = self._expand_to_neighbors(offsets, cached_offsets,
839
final_offsets = sorted(final_offsets)
840
if 'index' in debug.debug_flags:
841
trace.mutter('expanded: %s', final_offsets)
844
def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
845
"""Expand requests to neighbors until we have enough pages.
847
This is called from _expand_offsets after policy has determined that we
849
We only want to expand requests within a given layer. We cheat a little
850
bit and assume all requests will be in the same layer. This is true
851
given the current design, but if it changes this algorithm may perform
854
:param offsets: requested offsets
855
:param cached_offsets: offsets for pages we currently have cached
856
:return: A set() of offsets after expansion
858
final_offsets = set(offsets)
860
new_tips = set(final_offsets)
861
while len(final_offsets) < self._recommended_pages and new_tips:
865
first, end = self._find_layer_first_and_end(pos)
868
and previous not in cached_offsets
869
and previous not in final_offsets
870
and previous >= first):
871
next_tips.add(previous)
873
if (after < total_pages
874
and after not in cached_offsets
875
and after not in final_offsets
878
# This would keep us from going bigger than
879
# recommended_pages by only expanding the first offsets.
880
# However, if we are making a 'wide' request, it is
881
# reasonable to expand all points equally.
882
# if len(final_offsets) > recommended_pages:
884
final_offsets.update(next_tips)
888
def clear_cache(self):
889
"""Clear out any cached/memoized values.
891
This can be called at any time, but generally it is used when we have
892
extracted some information, but don't expect to be requesting any more
895
# Note that we don't touch self._root_node or self._internal_node_cache
896
# We don't expect either of those to be big, and it can save
897
# round-trips in the future. We may re-evaluate this if InternalNode
898
# memory starts to be an issue.
899
self._leaf_node_cache.clear()
901
def external_references(self, ref_list_num):
902
if self._root_node is None:
903
self._get_root_node()
904
if ref_list_num + 1 > self.node_ref_lists:
905
raise ValueError('No ref list %d, index has %d ref lists'
906
% (ref_list_num, self.node_ref_lists))
909
for node in self.iter_all_entries():
911
refs.update(node[3][ref_list_num])
914
def _find_layer_first_and_end(self, offset):
915
"""Find the start/stop nodes for the layer corresponding to offset.
917
:return: (first, end)
918
first is the first node in this layer
919
end is the first node of the next layer
922
for roffset in self._row_offsets:
929
def _get_offsets_to_cached_pages(self):
930
"""Determine what nodes we already have cached."""
931
cached_offsets = set(self._internal_node_cache.keys())
932
cached_offsets.update(self._leaf_node_cache.keys())
933
if self._root_node is not None:
934
cached_offsets.add(0)
935
return cached_offsets
937
def _get_root_node(self):
938
if self._root_node is None:
939
# We may not have a root node yet
940
self._get_internal_nodes([0])
941
return self._root_node
943
def _get_nodes(self, cache, node_indexes):
946
for idx in node_indexes:
947
if idx == 0 and self._root_node is not None:
948
found[0] = self._root_node
951
found[idx] = cache[idx]
956
needed = self._expand_offsets(needed)
957
found.update(self._get_and_cache_nodes(needed))
960
def _get_internal_nodes(self, node_indexes):
961
"""Get a node, from cache or disk.
963
After getting it, the node will be cached.
965
return self._get_nodes(self._internal_node_cache, node_indexes)
967
def _cache_leaf_values(self, nodes):
968
"""Cache directly from key => value, skipping the btree."""
969
if self._leaf_value_cache is not None:
970
for node in nodes.itervalues():
971
for key, value in node.all_items():
972
if key in self._leaf_value_cache:
973
# Don't add the rest of the keys, we've seen this node
976
self._leaf_value_cache[key] = value
978
def _get_leaf_nodes(self, node_indexes):
979
"""Get a bunch of nodes, from cache or disk."""
980
found = self._get_nodes(self._leaf_node_cache, node_indexes)
981
self._cache_leaf_values(found)
984
def iter_all_entries(self):
985
"""Iterate over all keys within the index.
987
:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
988
The former tuple is used when there are no reference lists in the
989
index, making the API compatible with simple key:value index types.
990
There is no defined order for the result iteration - it will be in
991
the most efficient order for the index.
993
if 'evil' in debug.debug_flags:
994
trace.mutter_callsite(3,
995
"iter_all_entries scales with size of history.")
996
if not self.key_count():
998
if self._row_offsets[-1] == 1:
999
# There is only the root node, and we read that via key_count()
1000
if self.node_ref_lists:
1001
for key, (value, refs) in self._root_node.all_items():
1002
yield (self, key, value, refs)
1004
for key, (value, refs) in self._root_node.all_items():
1005
yield (self, key, value)
1007
start_of_leaves = self._row_offsets[-2]
1008
end_of_leaves = self._row_offsets[-1]
1009
needed_offsets = range(start_of_leaves, end_of_leaves)
1010
if needed_offsets == [0]:
1011
# Special case when we only have a root node, as we have already
1013
nodes = [(0, self._root_node)]
1015
nodes = self._read_nodes(needed_offsets)
1016
# We iterate strictly in-order so that we can use this function
1017
# for spilling index builds to disk.
1018
if self.node_ref_lists:
1019
for _, node in nodes:
1020
for key, (value, refs) in node.all_items():
1021
yield (self, key, value, refs)
1023
for _, node in nodes:
1024
for key, (value, refs) in node.all_items():
1025
yield (self, key, value)
1028
def _multi_bisect_right(in_keys, fixed_keys):
1029
"""Find the positions where each 'in_key' would fit in fixed_keys.
1031
This is equivalent to doing "bisect_right" on each in_key into
1034
:param in_keys: A sorted list of keys to match with fixed_keys
1035
:param fixed_keys: A sorted list of keys to match against
1036
:return: A list of (integer position, [key list]) tuples.
1041
# no pointers in the fixed_keys list, which means everything must
1043
return [(0, in_keys)]
1045
# TODO: Iterating both lists will generally take M + N steps
1046
# Bisecting each key will generally take M * log2 N steps.
1047
# If we had an efficient way to compare, we could pick the method
1048
# based on which has the fewer number of steps.
1049
# There is also the argument that bisect_right is a compiled
1050
# function, so there is even more to be gained.
1051
# iter_steps = len(in_keys) + len(fixed_keys)
1052
# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
1053
if len(in_keys) == 1: # Bisect will always be faster for M = 1
1054
return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
1055
# elif bisect_steps < iter_steps:
1057
# for key in in_keys:
1058
# offsets.setdefault(bisect_right(fixed_keys, key),
1060
# return [(o, offsets[o]) for o in sorted(offsets)]
1061
in_keys_iter = iter(in_keys)
1062
fixed_keys_iter = enumerate(fixed_keys)
1063
cur_in_key = in_keys_iter.next()
1064
cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
1066
class InputDone(Exception): pass
1067
class FixedDone(Exception): pass
1072
# TODO: Another possibility is that rather than iterating on each side,
1073
# we could use a combination of bisecting and iterating. For
1074
# example, while cur_in_key < fixed_key, bisect to find its
1075
# point, then iterate all matching keys, then bisect (restricted
1076
# to only the remainder) for the next one, etc.
1079
if cur_in_key < cur_fixed_key:
1081
cur_out = (cur_fixed_offset, cur_keys)
1082
output.append(cur_out)
1083
while cur_in_key < cur_fixed_key:
1084
cur_keys.append(cur_in_key)
1086
cur_in_key = in_keys_iter.next()
1087
except StopIteration:
1089
# At this point cur_in_key must be >= cur_fixed_key
1090
# step the cur_fixed_key until we pass the cur key, or walk off
1092
while cur_in_key >= cur_fixed_key:
1094
cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
1095
except StopIteration:
1098
# We consumed all of the input, nothing more to do
1101
# There was some input left, but we consumed all of fixed, so we
1102
# have to add one more for the tail
1103
cur_keys = [cur_in_key]
1104
cur_keys.extend(in_keys_iter)
1105
cur_out = (len(fixed_keys), cur_keys)
1106
output.append(cur_out)
1109
def _walk_through_internal_nodes(self, keys):
1110
"""Take the given set of keys, and find the corresponding LeafNodes.
1112
:param keys: An unsorted iterable of keys to search for
1113
:return: (nodes, index_and_keys)
1114
nodes is a dict mapping {index: LeafNode}
1115
keys_at_index is a list of tuples of [(index, [keys for Leaf])]
1117
# 6 seconds spent in miss_torture using the sorted() line.
1118
# Even with out of order disk IO it seems faster not to sort it when
1119
# large queries are being made.
1120
keys_at_index = [(0, sorted(keys))]
1122
for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
1123
node_indexes = [idx for idx, s_keys in keys_at_index]
1124
nodes = self._get_internal_nodes(node_indexes)
1126
next_nodes_and_keys = []
1127
for node_index, sub_keys in keys_at_index:
1128
node = nodes[node_index]
1129
positions = self._multi_bisect_right(sub_keys, node.keys)
1130
node_offset = next_row_start + node.offset
1131
next_nodes_and_keys.extend([(node_offset + pos, s_keys)
1132
for pos, s_keys in positions])
1133
keys_at_index = next_nodes_and_keys
1134
# We should now be at the _LeafNodes
1135
node_indexes = [idx for idx, s_keys in keys_at_index]
1137
# TODO: We may *not* want to always read all the nodes in one
1138
# big go. Consider setting a max size on this.
1139
nodes = self._get_leaf_nodes(node_indexes)
1140
return nodes, keys_at_index
1142
def iter_entries(self, keys):
1143
"""Iterate over keys within the index.
1145
:param keys: An iterable providing the keys to be retrieved.
1146
:return: An iterable as per iter_all_entries, but restricted to the
1147
keys supplied. No additional keys will be returned, and every
1148
key supplied that is in the index will be returned.
1150
# 6 seconds spent in miss_torture using the sorted() line.
1151
# Even with out of order disk IO it seems faster not to sort it when
1152
# large queries are being made.
1153
# However, now that we are doing multi-way bisecting, we need the keys
1154
# in sorted order anyway. We could change the multi-way code to not
1155
# require sorted order. (For example, it bisects for the first node,
1156
# does an in-order search until a key comes before the current point,
1157
# which it then bisects for, etc.)
1158
keys = frozenset(keys)
1162
if not self.key_count():
1166
if self._leaf_value_cache is None:
1170
value = self._leaf_value_cache.get(key, None)
1171
if value is not None:
1172
# This key is known not to be here, skip it
1174
if self.node_ref_lists:
1175
yield (self, key, value, refs)
1177
yield (self, key, value)
1179
needed_keys.append(key)
1185
nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
1186
for node_index, sub_keys in nodes_and_keys:
1189
node = nodes[node_index]
1190
for next_sub_key in sub_keys:
1191
if next_sub_key in node:
1192
value, refs = node[next_sub_key]
1193
if self.node_ref_lists:
1194
yield (self, next_sub_key, value, refs)
1196
yield (self, next_sub_key, value)
1198
def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
1199
"""Find the parent_map information for the set of keys.
1201
This populates the parent_map dict and missing_keys set based on the
1202
queried keys. It also can fill out an arbitrary number of parents that
1203
it finds while searching for the supplied keys.
1205
It is unlikely that you want to call this directly. See
1206
"CombinedGraphIndex.find_ancestry()" for a more appropriate API.
1208
:param keys: A keys whose ancestry we want to return
1209
Every key will either end up in 'parent_map' or 'missing_keys'.
1210
:param ref_list_num: This index in the ref_lists is the parents we
1212
:param parent_map: {key: parent_keys} for keys that are present in this
1213
index. This may contain more entries than were in 'keys', that are
1214
reachable ancestors of the keys requested.
1215
:param missing_keys: keys which are known to be missing in this index.
1216
This may include parents that were not directly requested, but we
1217
were able to determine that they are not present in this index.
1218
:return: search_keys parents that were found but not queried to know
1219
if they are missing or present. Callers can re-query this index for
1220
those keys, and they will be placed into parent_map or missing_keys
1222
if not self.key_count():
1223
# We use key_count() to trigger reading the root node and
1224
# determining info about this BTreeGraphIndex
1225
# If we don't have any keys, then everything is missing
1226
missing_keys.update(keys)
1228
if ref_list_num >= self.node_ref_lists:
1229
raise ValueError('No ref list %d, index has %d ref lists'
1230
% (ref_list_num, self.node_ref_lists))
1232
# The main trick we are trying to accomplish is that when we find a
1233
# key listing its parents, we expect that the parent key is also likely
1234
# to sit on the same page. Allowing us to expand parents quickly
1235
# without suffering the full stack of bisecting, etc.
1236
nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)
1238
# These are parent keys which could not be immediately resolved on the
1239
# page where the child was present. Note that we may already be
1240
# searching for that key, and it may actually be present [or known
1241
# missing] on one of the other pages we are reading.
1243
# We could try searching for them in the immediate previous or next
1244
# page. If they occur "later" we could put them in a pending lookup
1245
# set, and then for each node we read thereafter we could check to
1246
# see if they are present.
1247
# However, we don't know the impact of keeping this list of things
1248
# that I'm going to search for every node I come across from here on
1250
# It doesn't handle the case when the parent key is missing on a
1251
# page that we *don't* read. So we already have to handle being
1252
# re-entrant for that.
1253
# Since most keys contain a date string, they are more likely to be
1254
# found earlier in the file than later, but we would know that right
1255
# away (key < min_key), and wouldn't keep searching it on every other
1256
# page that we read.
1257
# Mostly, it is an idea, one which should be benchmarked.
1258
parents_not_on_page = set()
1260
for node_index, sub_keys in nodes_and_keys:
1263
# sub_keys is all of the keys we are looking for that should exist
1264
# on this page, if they aren't here, then they won't be found
1265
node = nodes[node_index]
1266
parents_to_check = set()
1267
for next_sub_key in sub_keys:
1268
if next_sub_key not in node:
1269
# This one is just not present in the index at all
1270
missing_keys.add(next_sub_key)
1272
value, refs = node[next_sub_key]
1273
parent_keys = refs[ref_list_num]
1274
parent_map[next_sub_key] = parent_keys
1275
parents_to_check.update(parent_keys)
1276
# Don't look for things we've already found
1277
parents_to_check = parents_to_check.difference(parent_map)
1278
# this can be used to test the benefit of having the check loop
1280
# parents_not_on_page.update(parents_to_check)
1282
while parents_to_check:
1283
next_parents_to_check = set()
1284
for key in parents_to_check:
1286
value, refs = node[key]
1287
parent_keys = refs[ref_list_num]
1288
parent_map[key] = parent_keys
1289
next_parents_to_check.update(parent_keys)
1291
# This parent either is genuinely missing, or should be
1292
# found on another page. Perf test whether it is better
1293
# to check if this node should fit on this page or not.
1294
# in the 'everything-in-one-pack' scenario, this *not*
1295
# doing the check is 237ms vs 243ms.
1296
# So slightly better, but I assume the standard 'lots
1297
# of packs' is going to show a reasonable improvement
1298
# from the check, because it avoids 'going around
1299
# again' for everything that is in another index
1300
# parents_not_on_page.add(key)
1301
# Missing for some reason
1302
if key < node.min_key:
1303
# in the case of bzr.dev, 3.4k/5.3k misses are
1304
# 'earlier' misses (65%)
1305
parents_not_on_page.add(key)
1306
elif key > node.max_key:
1307
# This parent key would be present on a different
1309
parents_not_on_page.add(key)
1311
# assert key != node.min_key and key != node.max_key
1312
# If it was going to be present, it would be on
1313
# *this* page, so mark it missing.
1314
missing_keys.add(key)
1315
parents_to_check = next_parents_to_check.difference(parent_map)
1316
# Might want to do another .difference() from missing_keys
1317
# parents_not_on_page could have been found on a different page, or be
1318
# known to be missing. So cull out everything that has already been
1320
search_keys = parents_not_on_page.difference(
1321
parent_map).difference(missing_keys)
1324
def iter_entries_prefix(self, keys):
1325
"""Iterate over keys within the index using prefix matching.
1327
Prefix matching is applied within the tuple of a key, not to within
1328
the bytestring of each key element. e.g. if you have the keys ('foo',
1329
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
1330
only the former key is returned.
1332
WARNING: Note that this method currently causes a full index parse
1333
unconditionally (which is reasonably appropriate as it is a means for
1334
thunking many small indices into one larger one and still supplies
1335
iter_all_entries at the thunk layer).
1337
:param keys: An iterable providing the key prefixes to be retrieved.
1338
Each key prefix takes the form of a tuple the length of a key, but
1339
with the last N elements 'None' rather than a regular bytestring.
1340
The first element cannot be 'None'.
1341
:return: An iterable as per iter_all_entries, but restricted to the
1342
keys with a matching prefix to those supplied. No additional keys
1343
will be returned, and every match that is in the index will be
1346
keys = sorted(set(keys))
1349
# Load if needed to check key lengths
1350
if self._key_count is None:
1351
self._get_root_node()
1352
# TODO: only access nodes that can satisfy the prefixes we are looking
1353
# for. For now, to meet API usage (as this function is not used by
1354
# current bzrlib) just suck the entire index and iterate in memory.
1356
if self.node_ref_lists:
1357
if self._key_length == 1:
1358
for _1, key, value, refs in self.iter_all_entries():
1359
nodes[key] = value, refs
1362
for _1, key, value, refs in self.iter_all_entries():
1363
key_value = key, value, refs
1364
# For a key of (foo, bar, baz) create
1365
# _nodes_by_key[foo][bar][baz] = key_value
1366
key_dict = nodes_by_key
1367
for subkey in key[:-1]:
1368
key_dict = key_dict.setdefault(subkey, {})
1369
key_dict[key[-1]] = key_value
1371
if self._key_length == 1:
1372
for _1, key, value in self.iter_all_entries():
1376
for _1, key, value in self.iter_all_entries():
1377
key_value = key, value
1378
# For a key of (foo, bar, baz) create
1379
# _nodes_by_key[foo][bar][baz] = key_value
1380
key_dict = nodes_by_key
1381
for subkey in key[:-1]:
1382
key_dict = key_dict.setdefault(subkey, {})
1383
key_dict[key[-1]] = key_value
1384
if self._key_length == 1:
1388
raise errors.BadIndexKey(key)
1389
if len(key) != self._key_length:
1390
raise errors.BadIndexKey(key)
1392
if self.node_ref_lists:
1393
value, node_refs = nodes[key]
1394
yield self, key, value, node_refs
1396
yield self, key, nodes[key]
1403
raise errors.BadIndexKey(key)
1404
if len(key) != self._key_length:
1405
raise errors.BadIndexKey(key)
1406
# find what it refers to:
1407
key_dict = nodes_by_key
1408
elements = list(key)
1409
# find the subdict whose contents should be returned.
1411
while len(elements) and elements[0] is not None:
1412
key_dict = key_dict[elements[0]]
1415
# a non-existant lookup.
1420
key_dict = dicts.pop(-1)
1421
# can't be empty or would not exist
1422
item, value = key_dict.iteritems().next()
1423
if type(value) == dict:
1425
dicts.extend(key_dict.itervalues())
1428
for value in key_dict.itervalues():
1429
# each value is the key:value:node refs tuple
1431
yield (self, ) + value
1433
# the last thing looked up was a terminal element
1434
yield (self, ) + key_dict
1436
def key_count(self):
1437
"""Return an estimate of the number of keys in this index.
1439
For BTreeGraphIndex the estimate is exact as it is contained in the
1442
if self._key_count is None:
1443
self._get_root_node()
1444
return self._key_count
1446
def _compute_row_offsets(self):
1447
"""Fill out the _row_offsets attribute based on _row_lengths."""
1450
for row in self._row_lengths:
1451
offsets.append(row_offset)
1453
offsets.append(row_offset)
1454
self._row_offsets = offsets
1456
def _parse_header_from_bytes(self, bytes):
1457
"""Parse the header from a region of bytes.
1459
:param bytes: The data to parse.
1460
:return: An offset, data tuple such as readv yields, for the unparsed
1461
data. (which may be of length 0).
1463
signature = bytes[0:len(self._signature())]
1464
if not signature == self._signature():
1465
raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
1466
lines = bytes[len(self._signature()):].splitlines()
1467
options_line = lines[0]
1468
if not options_line.startswith(_OPTION_NODE_REFS):
1469
raise errors.BadIndexOptions(self)
1471
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
1473
raise errors.BadIndexOptions(self)
1474
options_line = lines[1]
1475
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
1476
raise errors.BadIndexOptions(self)
1478
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
1480
raise errors.BadIndexOptions(self)
1481
options_line = lines[2]
1482
if not options_line.startswith(_OPTION_LEN):
1483
raise errors.BadIndexOptions(self)
1485
self._key_count = int(options_line[len(_OPTION_LEN):])
1487
raise errors.BadIndexOptions(self)
1488
options_line = lines[3]
1489
if not options_line.startswith(_OPTION_ROW_LENGTHS):
1490
raise errors.BadIndexOptions(self)
1492
self._row_lengths = map(int, [length for length in
1493
options_line[len(_OPTION_ROW_LENGTHS):].split(',')
1496
raise errors.BadIndexOptions(self)
1497
self._compute_row_offsets()
1499
# calculate the bytes we have processed
1500
header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
1501
return header_end, bytes[header_end:]
1503
def _read_nodes(self, nodes):
1504
"""Read some nodes from disk into the LRU cache.
1506
This performs a readv to get the node data into memory, and parses each
1507
node, then yields it to the caller. The nodes are requested in the
1508
supplied order. If possible doing sort() on the list before requesting
1509
a read may improve performance.
1511
:param nodes: The nodes to read. 0 - first node, 1 - second node etc.
1514
# may be the byte string of the whole file
1516
# list of (offset, length) regions of the file that should, evenually
1517
# be read in to data_ranges, either from 'bytes' or from the transport
1519
base_offset = self._base_offset
1521
offset = (index * _PAGE_SIZE)
1524
# Root node - special case
1526
size = min(_PAGE_SIZE, self._size)
1528
# The only case where we don't know the size, is for very
1529
# small indexes. So we read the whole thing
1530
bytes = self._transport.get_bytes(self._name)
1531
num_bytes = len(bytes)
1532
self._size = num_bytes - base_offset
1533
# the whole thing should be parsed out of 'bytes'
1534
ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
1535
for start in xrange(base_offset, num_bytes, _PAGE_SIZE)]
1538
if offset > self._size:
1539
raise AssertionError('tried to read past the end'
1540
' of the file %s > %s'
1541
% (offset, self._size))
1542
size = min(size, self._size - offset)
1543
ranges.append((base_offset + offset, size))
1546
elif bytes is not None:
1547
# already have the whole file
1548
data_ranges = [(start, bytes[start:start+size])
1549
for start, size in ranges]
1550
elif self._file is None:
1551
data_ranges = self._transport.readv(self._name, ranges)
1554
for offset, size in ranges:
1555
self._file.seek(offset)
1556
data_ranges.append((offset, self._file.read(size)))
1557
for offset, data in data_ranges:
1558
offset -= base_offset
1560
# extract the header
1561
offset, data = self._parse_header_from_bytes(data)
1564
bytes = zlib.decompress(data)
1565
if bytes.startswith(_LEAF_FLAG):
1566
node = self._leaf_factory(bytes, self._key_length,
1567
self.node_ref_lists)
1568
elif bytes.startswith(_INTERNAL_FLAG):
1569
node = _InternalNode(bytes)
1571
raise AssertionError("Unknown node type for %r" % bytes)
1572
yield offset / _PAGE_SIZE, node
1574
def _signature(self):
1575
"""The file signature for this index type."""
1579
"""Validate that everything in the index can be accessed."""
1580
# just read and parse every node.
1581
self._get_root_node()
1582
if len(self._row_lengths) > 1:
1583
start_node = self._row_offsets[1]
1585
# We shouldn't be reading anything anyway
1587
node_end = self._row_offsets[-1]
1588
for node in self._read_nodes(range(start_node, node_end)):
1592
_gcchk_factory = _LeafNode
1595
from bzrlib import _btree_serializer_pyx as _btree_serializer
1596
_gcchk_factory = _btree_serializer._parse_into_chk
1597
except ImportError, e:
1598
osutils.failed_to_load_extension(e)
1599
from bzrlib import _btree_serializer_py as _btree_serializer