~bzr-pqm/bzr/bzr.dev : contents of bzrlib/btree

~bzr-pqm/bzr/bzr.dev : (revision 4728)

3641.3.29 by John Arbash Meinel Cleanup the copyright headers	1	# Copyright (C) 2008 Canonical Ltd
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	2	#
	3	# This program is free software; you can redistribute it and/or modify
3641.3.29 by John Arbash Meinel Cleanup the copyright headers	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob update FSF mailing address	15	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	16	#
	17
	18	"""B+Tree indices"""
	19
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	20	import cStringIO
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	21	from bisect import bisect_right
	22	import math
	23	import tempfile
	24	import zlib
	25
	26	from bzrlib import (
	27	chunk_writer,
	28	debug,
	29	errors,
4208.1.2 by John Arbash Meinel Switch to using a FIFOCache.	30	fifo_cache,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	31	index,
	32	lru_cache,
	33	osutils,
	34	trace,
	35	)
	36	from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
	37	from bzrlib.transport import get_transport
	38
	39
# First line of a serialised B+Tree index (written before the option lines).
_BTSIGNATURE = "B+Tree Graph Index 2\n"
# Prefix of the header option listing how many nodes each row contains.
_OPTION_ROW_LENGTHS = "row_lengths="
# First line of a leaf node's (uncompressed) content.
_LEAF_FLAG = "type=leaf\n"
# First line of an internal node's (uncompressed) content.
_INTERNAL_FLAG = "type=internal\n"
# Prefix of the second line of an internal node; followed by an integer.
_INTERNAL_OFFSET = "offset="

# Space set aside at the start of the first node of each row so that the
# index header can be placed in front of the root node.
_RESERVED_HEADER_BYTES = 120
# Size in bytes of every node (page) written to the index.
_PAGE_SIZE = 4096

# 4K per page: 4MB - 1000 entries
_NODE_CACHE_SIZE = 1000
	51
	52
	53	class _BuilderRow(object):
	54	"""The stored state accumulated while writing out a row in the index.
	55
	56	:ivar spool: A temporary file used to accumulate nodes for this row
	57	in the tree.
	58	:ivar nodes: The count of nodes emitted so far.
	59	"""
	60
	61	def __init__(self):
	62	"""Create a _BuilderRow."""
	63	self.nodes = 0
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	64	self.spool = None# tempfile.TemporaryFile(prefix='bzr-index-row-')
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	65	self.writer = None
	66
	67	def finish_node(self, pad=True):
	68	byte_lines, _, padding = self.writer.finish()
	69	if self.nodes == 0:
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	70	self.spool = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	71	# padded note:
	72	self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	73	elif self.nodes == 1:
	74	# We got bigger than 1 node, switch to a temp file
	75	spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
	76	spool.write(self.spool.getvalue())
	77	self.spool = spool
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	78	skipped_bytes = 0
	79	if not pad and padding:
	80	del byte_lines[-1]
	81	skipped_bytes = padding
	82	self.spool.writelines(byte_lines)
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	83	remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
	84	if remainder != 0:
	85	raise AssertionError("incorrect node length: %d, %d"
	86	% (self.spool.tell(), remainder))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	87	self.nodes += 1
	88	self.writer = None
	89
	90
	91	class _InternalBuilderRow(_BuilderRow):
	92	"""The stored state accumulated while writing out internal rows."""
	93
	94	def finish_node(self, pad=True):
	95	if not pad:
	96	raise AssertionError("Must pad internal nodes only.")
	97	_BuilderRow.finish_node(self)
	98
	99
class _LeafBuilderRow(_BuilderRow):
    """The stored state accumulated while writing out the leaf row."""
	102
	103
	104	class BTreeBuilder(index.GraphIndexBuilder):
	105	"""A Builder for B+Tree based Graph indices.
	106
	107	The resulting graph has the structure:
	108
	109	_SIGNATURE OPTIONS NODES
	110	_SIGNATURE := 'B+Tree Graph Index 1' NEWLINE
	111	OPTIONS := REF_LISTS KEY_ELEMENTS LENGTH
	112	REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
	113	KEY_ELEMENTS := 'key_elements=' DIGITS NEWLINE
	114	LENGTH := 'len=' DIGITS NEWLINE
	115	ROW_LENGTHS := 'row_lengths' DIGITS (COMMA DIGITS)*
	116	NODES := NODE_COMPRESSED*
	117	NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
	118	NODE_RAW := INTERNAL \| LEAF
	119	INTERNAL := INTERNAL_FLAG POINTERS
	120	LEAF := LEAF_FLAG ROWS
	121	KEY_ELEMENT := Not-whitespace-utf8
	122	KEY := KEY_ELEMENT (NULL KEY_ELEMENT)*
	123	ROWS := ROW*
	124	ROW := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
	125	ABSENT := 'a'
	126	REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
	127	REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
	128	REFERENCE := KEY
	129	VALUE := no-newline-no-null-bytes
	130	"""
	131
	132	def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
	133	"""See GraphIndexBuilder.__init__.
	134
	135	:param spill_at: Optional parameter controlling the maximum number
	136	of nodes that BTreeBuilder will hold in memory.
	137	"""
	138	index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
	139	key_elements=key_elements)
	140	self._spill_at = spill_at
	141	self._backing_indices = []
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	142	# A map of {key: (node_refs, value)}
	143	self._nodes = {}
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	144	# Indicate it hasn't been built yet
	145	self._nodes_by_key = None
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	146	self._optimize_for_size = False
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	147
	148	def add_node(self, key, value, references=()):
	149	"""Add a node to the index.
	150
	151	If adding the node causes the builder to reach its spill_at threshold,
	152	disk spilling will be triggered.
	153
	154	:param key: The key. keys are non-empty tuples containing
	155	as many whitespace-free utf8 bytestrings as the key length
	156	defined for this index.
	157	:param references: An iterable of iterables of keys. Each is a
	158	reference to another key.
	159	:param value: The value to associate with the key. It may be any
	160	bytes as long as it does not contain \0 or \n.
	161	"""
3644.2.9 by John Arbash Meinel Refactor some code.	162	# we don't care about absent_references
3644.2.9 by John Arbash Meinel Refactor some code.	163	node_refs, _ = self._check_key_ref_value(key, references, value)
3644.2.2 by John Arbash Meinel the new btree index doesn't have 'absent' keys in its _nodes	164	if key in self._nodes:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	165	raise errors.BadIndexDuplicateKey(key, self)
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	166	self._nodes[key] = (node_refs, value)
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	167	self._keys.add(key)
3644.2.9 by John Arbash Meinel Refactor some code.	168	if self._nodes_by_key is not None and self._key_length > 1:
3644.2.9 by John Arbash Meinel Refactor some code.	169	self._update_nodes_by_key(key, value, node_refs)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	170	if len(self._keys) < self._spill_at:
	171	return
3644.2.9 by John Arbash Meinel Refactor some code.	172	self._spill_mem_keys_to_disk()
	173
	174	def _spill_mem_keys_to_disk(self):
	175	"""Write the in memory keys down to disk to cap memory consumption.
	176
	177	If we already have some keys written to disk, we will combine them so
	178	as to preserve the sorted order. The algorithm for combining uses
	179	powers of two. So on the first spill, write all mem nodes into a
	180	single index. On the second spill, combine the mem nodes with the nodes
	181	on disk to create a 2x sized disk index and get rid of the first index.
	182	On the third spill, create a single new disk index, which will contain
	183	the mem nodes, and preserve the existing 2x sized index. On the fourth,
	184	combine mem with the first and second indexes, creating a new one of
	185	size 4x. On the fifth create a single new one, etc.
	186	"""
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	187	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	188	(new_backing_file, size,
	189	backing_pos) = self._spill_mem_keys_and_combine()
	190	else:
	191	new_backing_file, size = self._spill_mem_keys_without_combining()
	192	# Note: The transport here isn't strictly needed, because we will use
	193	# direct access to the new_backing._file object
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	194	new_backing = BTreeGraphIndex(get_transport('.'), '<temp>', size)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	195	# GC will clean up the file
	196	new_backing._file = new_backing_file
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	197	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	198	if len(self._backing_indices) == backing_pos:
	199	self._backing_indices.append(None)
	200	self._backing_indices[backing_pos] = new_backing
	201	for backing_pos in range(backing_pos):
	202	self._backing_indices[backing_pos] = None
	203	else:
	204	self._backing_indices.append(new_backing)
	205	self._keys = set()
	206	self._nodes = {}
	207	self._nodes_by_key = None
	208
	209	def _spill_mem_keys_without_combining(self):
	210	return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
	211
	212	def _spill_mem_keys_and_combine(self):
4168.3.4 by John Arbash Meinel Restore the ability to spill, but prepare a flag to disable it.	213	iterators_to_combine = [self._iter_mem_nodes()]
	214	pos = -1
	215	for pos, backing in enumerate(self._backing_indices):
	216	if backing is None:
	217	pos -= 1
	218	break
	219	iterators_to_combine.append(backing.iter_all_entries())
	220	backing_pos = pos + 1
	221	new_backing_file, size = \
	222	self._write_nodes(self._iter_smallest(iterators_to_combine),
	223	allow_optimize=False)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	224	return new_backing_file, size, backing_pos
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	225
	226	def add_nodes(self, nodes):
	227	"""Add nodes to the index.
	228
	229	:param nodes: An iterable of (key, node_refs, value) entries to add.
	230	"""
	231	if self.reference_lists:
	232	for (key, value, node_refs) in nodes:
	233	self.add_node(key, value, node_refs)
	234	else:
	235	for (key, value) in nodes:
	236	self.add_node(key, value)
	237
	238	def _iter_mem_nodes(self):
	239	"""Iterate over the nodes held in memory."""
3644.2.8 by John Arbash Meinel Two quick tweaks.	240	nodes = self._nodes
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	241	if self.reference_lists:
3644.2.8 by John Arbash Meinel Two quick tweaks.	242	for key in sorted(nodes):
	243	references, value = nodes[key]
	244	yield self, key, value, references
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	245	else:
3644.2.8 by John Arbash Meinel Two quick tweaks.	246	for key in sorted(nodes):
	247	references, value = nodes[key]
	248	yield self, key, value
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	249
	250	def _iter_smallest(self, iterators_to_combine):
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	251	if len(iterators_to_combine) == 1:
	252	for value in iterators_to_combine[0]:
	253	yield value
	254	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	255	current_values = []
	256	for iterator in iterators_to_combine:
	257	try:
	258	current_values.append(iterator.next())
	259	except StopIteration:
	260	current_values.append(None)
	261	last = None
	262	while True:
	263	# Decorate candidates with the value to allow 2.4's min to be used.
	264	candidates = [(item[1][1], item) for item
	265	in enumerate(current_values) if item[1] is not None]
	266	if not len(candidates):
	267	return
	268	selected = min(candidates)
	269	# undecorate back to (pos, node)
	270	selected = selected[1]
	271	if last == selected[1][1]:
	272	raise errors.BadIndexDuplicateKey(last, self)
	273	last = selected[1][1]
	274	# Yield, with self as the index
	275	yield (self,) + selected[1][1:]
	276	pos = selected[0]
	277	try:
	278	current_values[pos] = iterators_to_combine[pos].next()
	279	except StopIteration:
	280	current_values[pos] = None
	281
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	282	def _add_key(self, string_key, line, rows, allow_optimize=True):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	283	"""Add a key to the current chunk.
	284
	285	:param string_key: The key to add.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	286	:param line: The fully serialised key and value.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	287	:param allow_optimize: If set to False, prevent setting the optimize
	288	flag when writing out. This is used by the _spill_mem_keys_to_disk
	289	functionality.
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	290	"""
	291	if rows[-1].writer is None:
	292	# opening a new leaf chunk;
	293	for pos, internal_row in enumerate(rows[:-1]):
	294	# flesh out any internal nodes that are needed to
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	295	# preserve the height of the tree
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	296	if internal_row.writer is None:
	297	length = _PAGE_SIZE
	298	if internal_row.nodes == 0:
	299	length -= _RESERVED_HEADER_BYTES # padded
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	300	if allow_optimize:
	301	optimize_for_size = self._optimize_for_size
	302	else:
	303	optimize_for_size = False
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	304	internal_row.writer = chunk_writer.ChunkWriter(length, 0,
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	305	optimize_for_size=optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	306	internal_row.writer.write(_INTERNAL_FLAG)
	307	internal_row.writer.write(_INTERNAL_OFFSET +
	308	str(rows[pos + 1].nodes) + "\n")
	309	# add a new leaf
	310	length = _PAGE_SIZE
	311	if rows[-1].nodes == 0:
	312	length -= _RESERVED_HEADER_BYTES # padded
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	313	rows[-1].writer = chunk_writer.ChunkWriter(length,
	314	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	315	rows[-1].writer.write(_LEAF_FLAG)
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	316	if rows[-1].writer.write(line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	317	# this key did not fit in the node:
	318	rows[-1].finish_node()
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	319	key_line = string_key + "\n"
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	320	new_row = True
	321	for row in reversed(rows[:-1]):
	322	# Mark the start of the next node in the node above. If it
4031.3.1 by Frank Aspell Fixing various typos	323	# doesn't fit then propagate upwards until we find one that
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	324	# it does fit into.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	325	if row.writer.write(key_line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	326	row.finish_node()
	327	else:
	328	# We've found a node that can handle the pointer.
	329	new_row = False
	330	break
	331	# If we reached the current root without being able to mark the
	332	# division point, then we need a new root:
	333	if new_row:
	334	# We need a new row
	335	if 'index' in debug.debug_flags:
	336	trace.mutter('Inserting new global row.')
	337	new_row = _InternalBuilderRow()
	338	reserved_bytes = 0
	339	rows.insert(0, new_row)
	340	# This will be padded, hence the -100
	341	new_row.writer = chunk_writer.ChunkWriter(
	342	_PAGE_SIZE - _RESERVED_HEADER_BYTES,
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	343	reserved_bytes,
	344	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	345	new_row.writer.write(_INTERNAL_FLAG)
	346	new_row.writer.write(_INTERNAL_OFFSET +
	347	str(rows[1].nodes - 1) + "\n")
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	348	new_row.writer.write(key_line)
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	349	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	350
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	351	def _write_nodes(self, node_iterator, allow_optimize=True):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	352	"""Write node_iterator out as a B+Tree.
	353
	354	:param node_iterator: An iterator of sorted nodes. Each node should
	355	match the output given by iter_all_entries.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	356	:param allow_optimize: If set to False, prevent setting the optimize
	357	flag when writing out. This is used by the _spill_mem_keys_to_disk
	358	functionality.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	359	:return: A file handle for a temporary file containing a B+Tree for
	360	the nodes.
	361	"""
	362	# The index rows - rows[0] is the root, rows[1] is the layer under it
	363	# etc.
	364	rows = []
	365	# forward sorted by key. In future we may consider topological sorting,
	366	# at the cost of table scans for direct lookup, or a second index for
	367	# direct lookup
	368	key_count = 0
	369	# A stack with the number of nodes of each size. 0 is the root node
	370	# and must always be 1 (if there are any nodes in the tree).
	371	self.row_lengths = []
	372	# Loop over all nodes adding them to the bottom row
	373	# (rows[-1]). When we finish a chunk in a row,
4031.3.1 by Frank Aspell Fixing various typos	374	# propagate the key that didn't fit (comes after the chunk) to the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	375	# row above, transitively.
	376	for node in node_iterator:
	377	if key_count == 0:
	378	# First key triggers the first row
	379	rows.append(_LeafBuilderRow())
	380	key_count += 1
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	381	string_key, line = _btree_serializer._flatten_node(node,
	382	self.reference_lists)
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	383	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	384	for row in reversed(rows):
	385	pad = (type(row) != _LeafBuilderRow)
	386	row.finish_node(pad=pad)
	387	lines = [_BTSIGNATURE]
	388	lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
	389	lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
	390	lines.append(_OPTION_LEN + str(key_count) + '\n')
	391	row_lengths = [row.nodes for row in rows]
	392	lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	393	if row_lengths and row_lengths[-1] > 1:
	394	result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
	395	else:
	396	result = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	397	result.writelines(lines)
	398	position = sum(map(len, lines))
	399	root_row = True
	400	if position > _RESERVED_HEADER_BYTES:
	401	raise AssertionError("Could not fit the header in the"
	402	" reserved space: %d > %d"
	403	% (position, _RESERVED_HEADER_BYTES))
	404	# write the rows out:
	405	for row in rows:
	406	reserved = _RESERVED_HEADER_BYTES # reserved space for first node
	407	row.spool.flush()
	408	row.spool.seek(0)
	409	# copy nodes to the finalised file.
	410	# Special case the first node as it may be prefixed
	411	node = row.spool.read(_PAGE_SIZE)
	412	result.write(node[reserved:])
	413	result.write("\x00" * (reserved - position))
	414	position = 0 # Only the root row actually has an offset
	415	copied_len = osutils.pumpfile(row.spool, result)
	416	if copied_len != (row.nodes - 1) * _PAGE_SIZE:
	417	if type(row) != _LeafBuilderRow:
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	418	raise AssertionError("Incorrect amount of data copied"
	419	" expected: %d, got: %d"
	420	% ((row.nodes - 1) * _PAGE_SIZE,
	421	copied_len))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	422	result.flush()
	423	size = result.tell()
	424	result.seek(0)
	425	return result, size
	426
	427	def finish(self):
	428	"""Finalise the index.
	429
	430	:return: A file handle for a temporary file containing the nodes added
	431	to the index.
	432	"""
	433	return self._write_nodes(self.iter_all_entries())[0]
	434
	435	def iter_all_entries(self):
	436	"""Iterate over all keys within the index
	437
4343.2.2 by John Arbash Meinel Fix an important doc bug about the api of iter_all_entries()	438	:return: An iterable of (index, key, value, reference_lists). There is
	439	no defined order for the result iteration - it will be in the most
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	440	efficient order for the index (in this case dictionary hash order).
	441	"""
	442	if 'evil' in debug.debug_flags:
	443	trace.mutter_callsite(3,
	444	"iter_all_entries scales with size of history.")
	445	# Doing serial rather than ordered would be faster; but this shouldn't
	446	# be getting called routinely anyway.
3644.2.8 by John Arbash Meinel Two quick tweaks.	447	iterators = [self._iter_mem_nodes()]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	448	for backing in self._backing_indices:
	449	if backing is not None:
	450	iterators.append(backing.iter_all_entries())
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	451	if len(iterators) == 1:
	452	return iterators[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	453	return self._iter_smallest(iterators)
	454
	455	def iter_entries(self, keys):
	456	"""Iterate over keys within the index.
	457
	458	:param keys: An iterable providing the keys to be retrieved.
	459	:return: An iterable of (index, key, value, reference_lists). There is no
	460	defined order for the result iteration - it will be in the most
	461	efficient order for the index (keys iteration order in this case).
	462	"""
	463	keys = set(keys)
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	464	local_keys = keys.intersection(self._keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	465	if self.reference_lists:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	466	for key in local_keys:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	467	node = self._nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	468	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	469	else:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	470	for key in local_keys:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	471	node = self._nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	472	yield self, key, node[1]
3847.2.1 by John Arbash Meinel Shortcut BTreeBuilder.iter_entries when there are no backing indices.	473	# Find things that are in backing indices that have not been handled
	474	# yet.
3847.2.3 by John Arbash Meinel Bring back the shortcut	475	if not self._backing_indices:
3847.2.3 by John Arbash Meinel Bring back the shortcut	476	return # We won't find anything there either
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	477	# Remove all of the keys that we found locally
	478	keys.difference_update(local_keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	479	for backing in self._backing_indices:
	480	if backing is None:
	481	continue
	482	if not keys:
	483	return
	484	for node in backing.iter_entries(keys):
	485	keys.remove(node[1])
	486	yield (self,) + node[1:]
	487
	488	def iter_entries_prefix(self, keys):
	489	"""Iterate over keys within the index using prefix matching.
	490
	491	Prefix matching is applied within the tuple of a key, not to within
	492	the bytestring of each key element. e.g. if you have the keys ('foo',
	493	'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
	494	only the former key is returned.
	495
	496	:param keys: An iterable providing the key prefixes to be retrieved.
	497	Each key prefix takes the form of a tuple the length of a key, but
	498	with the last N elements 'None' rather than a regular bytestring.
	499	The first element cannot be 'None'.
	500	:return: An iterable as per iter_all_entries, but restricted to the
	501	keys with a matching prefix to those supplied. No additional keys
	502	will be returned, and every match that is in the index will be
	503	returned.
	504	"""
	505	# XXX: To much duplication with the GraphIndex class; consider finding
	506	# a good place to pull out the actual common logic.
	507	keys = set(keys)
	508	if not keys:
	509	return
	510	for backing in self._backing_indices:
	511	if backing is None:
	512	continue
	513	for node in backing.iter_entries_prefix(keys):
	514	yield (self,) + node[1:]
	515	if self._key_length == 1:
	516	for key in keys:
	517	# sanity check
	518	if key[0] is None:
	519	raise errors.BadIndexKey(key)
	520	if len(key) != self._key_length:
	521	raise errors.BadIndexKey(key)
	522	try:
	523	node = self._nodes[key]
	524	except KeyError:
	525	continue
	526	if self.reference_lists:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	527	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	528	else:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	529	yield self, key, node[1]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	530	return
	531	for key in keys:
	532	# sanity check
	533	if key[0] is None:
	534	raise errors.BadIndexKey(key)
	535	if len(key) != self._key_length:
	536	raise errors.BadIndexKey(key)
	537	# find what it refers to:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	538	key_dict = self._get_nodes_by_key()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	539	elements = list(key)
	540	# find the subdict to return
	541	try:
	542	while len(elements) and elements[0] is not None:
	543	key_dict = key_dict[elements[0]]
	544	elements.pop(0)
	545	except KeyError:
	546	# a non-existant lookup.
	547	continue
	548	if len(elements):
	549	dicts = [key_dict]
	550	while dicts:
	551	key_dict = dicts.pop(-1)
	552	# can't be empty or would not exist
	553	item, value = key_dict.iteritems().next()
	554	if type(value) == dict:
	555	# push keys
	556	dicts.extend(key_dict.itervalues())
	557	else:
	558	# yield keys
	559	for value in key_dict.itervalues():
	560	yield (self, ) + value
	561	else:
	562	yield (self, ) + key_dict
	563
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	564	def _get_nodes_by_key(self):
	565	if self._nodes_by_key is None:
	566	nodes_by_key = {}
	567	if self.reference_lists:
	568	for key, (references, value) in self._nodes.iteritems():
	569	key_dict = nodes_by_key
	570	for subkey in key[:-1]:
	571	key_dict = key_dict.setdefault(subkey, {})
	572	key_dict[key[-1]] = key, value, references
	573	else:
	574	for key, (references, value) in self._nodes.iteritems():
	575	key_dict = nodes_by_key
	576	for subkey in key[:-1]:
	577	key_dict = key_dict.setdefault(subkey, {})
	578	key_dict[key[-1]] = key, value
	579	self._nodes_by_key = nodes_by_key
	580	return self._nodes_by_key
	581
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	582	def key_count(self):
	583	"""Return an estimate of the number of keys in this index.
	584
	585	For InMemoryGraphIndex the estimate is exact.
	586	"""
	587	return len(self._keys) + sum(backing.key_count() for backing in
	588	self._backing_indices if backing is not None)
	589
	590	def validate(self):
	591	"""In memory index's have no known corruption at the moment."""
	592
	593
	594	class _LeafNode(object):
	595	"""A leaf node for a serialised B+Tree index."""
	596
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	597	__slots__ = ('keys', 'min_key', 'max_key')
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	598
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	599	def __init__(self, bytes, key_length, ref_list_length):
	600	"""Parse bytes to create a leaf node object."""
	601	# splitlines mangles the \r delimiters.. don't use it.
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	602	key_list = _btree_serializer._parse_leaf_lines(bytes,
	603	key_length, ref_list_length)
	604	if key_list:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	605	self.min_key = key_list[0][0]
	606	self.max_key = key_list[-1][0]
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	607	else:
	608	self.min_key = self.max_key = None
	609	self.keys = dict(key_list)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	610
	611
	612	class _InternalNode(object):
	613	"""An internal node for a serialised B+Tree index."""
	614
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	615	__slots__ = ('keys', 'offset')
	616
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	617	def __init__(self, bytes):
	618	"""Parse bytes to create an internal node object."""
	619	# splitlines mangles the \r delimiters.. don't use it.
	620	self.keys = self._parse_lines(bytes.split('\n'))
	621
	622	def _parse_lines(self, lines):
	623	nodes = []
	624	self.offset = int(lines[1][7:])
	625	for line in lines[2:]:
	626	if line == '':
	627	break
4075.3.1 by John Arbash Meinel Use PyString_InternInPlace to intern() the various parts of keys that are processed.	628	nodes.append(tuple(map(intern, line.split('\0'))))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	629	return nodes
	630
	631
	632	class BTreeGraphIndex(object):
	633	"""Access to nodes via the standard GraphIndex interface for B+Tree's.
	634
	635	Individual nodes are held in a LRU cache. This holds the root node in
	636	memory except when very large walks are done.
	637	"""
	638
	639	def __init__(self, transport, name, size):
	640	"""Create a B+Tree index object on the index name.
	641
	642	:param transport: The transport to read data for the index from.
	643	:param name: The file name of the index on transport.
	644	:param size: Optional size of the index in bytes. This allows
	645	compatibility with the GraphIndex API, as well as ensuring that
	646	the initial read (to read the root node header) can be done
	647	without over-reading even on empty indices, and on small indices
	648	allows single-IO to read the entire index.
	649	"""
	650	self._transport = transport
	651	self._name = name
	652	self._size = size
	653	self._file = None
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	654	self._recommended_pages = self._compute_recommended_pages()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	655	self._root_node = None
	656	# Default max size is 100,000 leave values
	657	self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
	658	self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
4208.1.1 by John Arbash Meinel Use a simple dict cache for btree internal_node lookups.	659	# We could limit this, but even a 300k record btree has only 3k leaf
	660	# nodes, and only 20 internal nodes. So the default of 100 nodes in an
	661	# LRU would mean we always cache everything anyway, no need to pay the
	662	# overhead of LRU
4208.1.2 by John Arbash Meinel Switch to using a FIFOCache.	663	self._internal_node_cache = fifo_cache.FIFOCache(100)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	664	self._key_count = None
	665	self._row_lengths = None
	666	self._row_offsets = None # Start of each row, [-1] is the end
	667
	668	def __eq__(self, other):
	669	"""Equal when self and other were created with the same parameters."""
	670	return (
	671	type(self) == type(other) and
	672	self._transport == other._transport and
	673	self._name == other._name and
	674	self._size == other._size)
	675
	676	def __ne__(self, other):
	677	return not self.__eq__(other)
	678
3763.8.12 by John Arbash Meinel Code cleanup.	679	def _get_and_cache_nodes(self, nodes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	680	"""Read nodes and cache them in the lru.
	681
	682	The nodes list supplied is sorted and then read from disk, each node
	683	being inserted it into the _node_cache.
	684
	685	Note: Asking for more nodes than the _node_cache can contain will
	686	result in some of the results being immediately discarded, to prevent
	687	this an assertion is raised if more nodes are asked for than are
	688	cachable.
	689
	690	:return: A dict of {node_pos: node}
	691	"""
	692	found = {}
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	693	start_of_leaves = None
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	694	for node_pos, node in self._read_nodes(sorted(nodes)):
	695	if node_pos == 0: # Special case
	696	self._root_node = node
	697	else:
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	698	if start_of_leaves is None:
	699	start_of_leaves = self._row_offsets[-2]
	700	if node_pos < start_of_leaves:
	701	self._internal_node_cache.add(node_pos, node)
	702	else:
	703	self._leaf_node_cache.add(node_pos, node)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	704	found[node_pos] = node
	705	return found
	706
def _compute_recommended_pages(self):
    """Convert transport's recommended_page_size into btree pages.

    recommended_page_size is in bytes, we want to know how many _PAGE_SIZE
    pages fit in that length. The quotient is rounded up to a whole page.

    :return: The number of pages, as an int.
    """
    read_bytes = self._transport.recommended_page_size()
    # Divide as floats so that a partial page rounds up instead of being
    # truncated away.
    page_bytes = float(_PAGE_SIZE)
    return int(math.ceil(read_bytes / page_bytes))
	717
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	718	def _compute_total_pages_in_index(self):
	719	"""How many pages are in the index.
	720
	721	If we have read the header we will use the value stored there.
	722	Otherwise it will be computed based on the length of the index.
	723	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	724	if self._size is None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	725	raise AssertionError('_compute_total_pages_in_index should not be'
	726	' called when self._size is None')
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	727	if self._root_node is not None:
	728	# This is the number of pages as defined by the header
	729	return self._row_offsets[-1]
	730	# This is the number of pages as defined by the size of the index. They
	731	# should be indentical.
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	732	total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
	733	return total_pages
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	734
def _expand_offsets(self, offsets):
    """Find extra pages to download.

    The idea is that we always want to make big-enough requests (like 64kB
    for http), so that we don't waste round trips. So given the entries
    that we already have cached and the new pages being downloaded figure
    out what other pages we might want to read.

    See also doc/developers/btree_index_prefetch.txt for more details.

    :param offsets: The offsets to be read
    :return: A list of offsets to download
    """
    verbose = 'index' in debug.debug_flags
    if verbose:
        trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)

    requested = len(offsets)
    if requested >= self._recommended_pages:
        # Don't add more, we are already requesting more than enough
        if verbose:
            trace.mutter(' not expanding large request (%s >= %s)',
                         requested, self._recommended_pages)
        return offsets
    if self._size is None:
        # Don't try anything, because we don't know where the file ends
        if verbose:
            trace.mutter(' not expanding without knowing index size')
        return offsets

    total_pages = self._compute_total_pages_in_index()
    cached_offsets = self._get_offsets_to_cached_pages()
    # If reading recommended_pages would read the rest of the index, just
    # do so.
    if total_pages - len(cached_offsets) <= self._recommended_pages:
        # Read whatever is left
        if cached_offsets:
            expanded = [page for page in xrange(total_pages)
                        if page not in cached_offsets]
        else:
            expanded = range(total_pages)
        if verbose:
            trace.mutter(' reading all unread pages: %s', expanded)
        return expanded

    if self._root_node is not None:
        tree_depth = len(self._row_lengths)
        if requested == 1 and len(cached_offsets) < tree_depth:
            # We haven't read enough to justify expansion
            # If we are only going to read the root node, and 1 leaf node,
            # then it isn't worth expanding our request. Once we've read at
            # least 2 nodes, then we are probably doing a search, and we
            # start expanding our requests.
            if verbose:
                trace.mutter(' not expanding on first reads')
            return offsets
        candidates = self._expand_to_neighbors(offsets, cached_offsets,
                                               total_pages)
    else:
        # ATM on the first read of the root node of a large index, we don't
        # bother pre-reading any other pages. This is because the
        # likelihood of actually reading interesting pages is very low.
        # See doc/developers/btree_index_prefetch.txt for a discussion, and
        # a possible implementation when we are guessing that the second
        # layer index is small
        candidates = offsets

    final_offsets = sorted(candidates)
    if verbose:
        trace.mutter('expanded: %s', final_offsets)
    return final_offsets
	803
	804	def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
	805	"""Expand requests to neighbors until we have enough pages.
	806
	807	This is called from _expand_offsets after policy has determined that we
	808	want to expand.
	809	We only want to expand requests within a given layer. We cheat a little
	810	bit and assume all requests will be in the same layer. This is true
	811	given the current design, but if it changes this algorithm may perform
	812	oddly.
	813
	814	:param offsets: requested offsets
	815	:param cached_offsets: offsets for pages we currently have cached
	816	:return: A set() of offsets after expansion
	817	"""
	818	final_offsets = set(offsets)
	819	first = end = None
	820	new_tips = set(final_offsets)
	821	while len(final_offsets) < self._recommended_pages and new_tips:
	822	next_tips = set()
	823	for pos in new_tips:
	824	if first is None:
	825	first, end = self._find_layer_first_and_end(pos)
	826	previous = pos - 1
	827	if (previous > 0
	828	and previous not in cached_offsets
	829	and previous not in final_offsets
	830	and previous >= first):
	831	next_tips.add(previous)
	832	after = pos + 1
	833	if (after < total_pages
	834	and after not in cached_offsets
	835	and after not in final_offsets
	836	and after < end):
	837	next_tips.add(after)
	838	# This would keep us from going bigger than
	839	# recommended_pages by only expanding the first offsets.
	840	# However, if we are making a 'wide' request, it is
	841	# reasonable to expand all points equally.
	842	# if len(final_offsets) > recommended_pages:
	843	# break
	844	final_offsets.update(next_tips)
	845	new_tips = next_tips
	846	return final_offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	847
def external_references(self, ref_list_num):
    """Return the keys referenced in this index but not present in it.

    Walks every entry via iter_all_entries(), so every entry is visited.

    :param ref_list_num: Which of each entry's reference lists to inspect.
    :return: A set of keys found in that reference list of some entry,
        minus the keys of the entries themselves.
    :raises ValueError: When the index has no reference list with that
        number.
    """
    if self._root_node is None:
        # Reading the root node makes self.node_ref_lists available.
        self._get_root_node()
    if self.node_ref_lists < ref_list_num + 1:
        raise ValueError('No ref list %d, index has %d ref lists'
                         % (ref_list_num, self.node_ref_lists))
    present = set()
    referenced = set()
    for entry in self.iter_all_entries():
        # entry is (index, key, value, reference_lists)
        present.add(entry[1])
        referenced.update(entry[3][ref_list_num])
    return referenced - present
	860
3763.8.12 by John Arbash Meinel Code cleanup.	861	def _find_layer_first_and_end(self, offset):
	862	"""Find the start/stop nodes for the layer corresponding to offset.
	863
	864	:return: (first, end)
	865	first is the first node in this layer
	866	end is the first node of the next layer
	867	"""
	868	first = end = 0
	869	for roffset in self._row_offsets:
	870	first = end
	871	end = roffset
	872	if offset < roffset:
	873	break
	874	return first, end
	875
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	876	def _get_offsets_to_cached_pages(self):
3763.8.12 by John Arbash Meinel Code cleanup.	877	"""Determine what nodes we already have cached."""
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	878	cached_offsets = set(self._internal_node_cache.keys())
	879	cached_offsets.update(self._leaf_node_cache.keys())
3763.8.12 by John Arbash Meinel Code cleanup.	880	if self._root_node is not None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	881	cached_offsets.add(0)
	882	return cached_offsets
3763.8.12 by John Arbash Meinel Code cleanup.	883
	884	def _get_root_node(self):
	885	if self._root_node is None:
	886	# We may not have a root node yet
	887	self._get_internal_nodes([0])
	888	return self._root_node
	889
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	890	def _get_nodes(self, cache, node_indexes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	891	found = {}
	892	needed = []
	893	for idx in node_indexes:
	894	if idx == 0 and self._root_node is not None:
	895	found[0] = self._root_node
	896	continue
	897	try:
	898	found[idx] = cache[idx]
	899	except KeyError:
	900	needed.append(idx)
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	901	if not needed:
	902	return found
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	903	needed = self._expand_offsets(needed)
3763.8.12 by John Arbash Meinel Code cleanup.	904	found.update(self._get_and_cache_nodes(needed))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	905	return found
	906
	907	def _get_internal_nodes(self, node_indexes):
	908	"""Get a node, from cache or disk.
	909
	910	After getting it, the node will be cached.
	911	"""
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	912	return self._get_nodes(self._internal_node_cache, node_indexes)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	913
3805.4.6 by John Arbash Meinel refactor for clarity.	914	def _cache_leaf_values(self, nodes):
3805.4.6 by John Arbash Meinel refactor for clarity.	915	"""Cache directly from key => value, skipping the btree."""
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	916	if self._leaf_value_cache is not None:
3805.4.6 by John Arbash Meinel refactor for clarity.	917	for node in nodes.itervalues():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	918	for key, value in node.keys.iteritems():
	919	if key in self._leaf_value_cache:
	920	# Don't add the rest of the keys, we've seen this node
	921	# before.
	922	break
	923	self._leaf_value_cache[key] = value
3805.4.6 by John Arbash Meinel refactor for clarity.	924
	925	def _get_leaf_nodes(self, node_indexes):
	926	"""Get a bunch of nodes, from cache or disk."""
	927	found = self._get_nodes(self._leaf_node_cache, node_indexes)
	928	self._cache_leaf_values(found)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	929	return found
	930
	931	def iter_all_entries(self):
	932	"""Iterate over all keys within the index.
	933
	934	:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
	935	The former tuple is used when there are no reference lists in the
	936	index, making the API compatible with simple key:value index types.
	937	There is no defined order for the result iteration - it will be in
	938	the most efficient order for the index.
	939	"""
	940	if 'evil' in debug.debug_flags:
	941	trace.mutter_callsite(3,
	942	"iter_all_entries scales with size of history.")
	943	if not self.key_count():
	944	return
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	945	if self._row_offsets[-1] == 1:
	946	# There is only the root node, and we read that via key_count()
	947	if self.node_ref_lists:
	948	for key, (value, refs) in sorted(self._root_node.keys.items()):
	949	yield (self, key, value, refs)
	950	else:
	951	for key, (value, refs) in sorted(self._root_node.keys.items()):
	952	yield (self, key, value)
	953	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	954	start_of_leaves = self._row_offsets[-2]
	955	end_of_leaves = self._row_offsets[-1]
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	956	needed_offsets = range(start_of_leaves, end_of_leaves)
	957	if needed_offsets == [0]:
	958	# Special case when we only have a root node, as we have already
	959	# read everything
	960	nodes = [(0, self._root_node)]
	961	else:
	962	nodes = self._read_nodes(needed_offsets)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	963	# We iterate strictly in-order so that we can use this function
	964	# for spilling index builds to disk.
	965	if self.node_ref_lists:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	966	for _, node in nodes:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	967	for key, (value, refs) in sorted(node.keys.items()):
	968	yield (self, key, value, refs)
	969	else:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	970	for _, node in nodes:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	971	for key, (value, refs) in sorted(node.keys.items()):
	972	yield (self, key, value)
	973
	974	@staticmethod
	975	def _multi_bisect_right(in_keys, fixed_keys):
	976	"""Find the positions where each 'in_key' would fit in fixed_keys.
	977
	978	This is equivalent to doing "bisect_right" on each in_key into
	979	fixed_keys
	980
	981	:param in_keys: A sorted list of keys to match with fixed_keys
	982	:param fixed_keys: A sorted list of keys to match against
	983	:return: A list of (integer position, [key list]) tuples.
	984	"""
	985	if not in_keys:
	986	return []
	987	if not fixed_keys:
	988	# no pointers in the fixed_keys list, which means everything must
	989	# fall to the left.
	990	return [(0, in_keys)]
	991
	992	# TODO: Iterating both lists will generally take M + N steps
	993	# Bisecting each key will generally take M * log2 N steps.
	994	# If we had an efficient way to compare, we could pick the method
	995	# based on which has the fewer number of steps.
	996	# There is also the argument that bisect_right is a compiled
	997	# function, so there is even more to be gained.
	998	# iter_steps = len(in_keys) + len(fixed_keys)
	999	# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
	1000	if len(in_keys) == 1: # Bisect will always be faster for M = 1
	1001	return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
	1002	# elif bisect_steps < iter_steps:
	1003	# offsets = {}
	1004	# for key in in_keys:
	1005	# offsets.setdefault(bisect_right(fixed_keys, key),
	1006	# []).append(key)
	1007	# return [(o, offsets[o]) for o in sorted(offsets)]
	1008	in_keys_iter = iter(in_keys)
	1009	fixed_keys_iter = enumerate(fixed_keys)
	1010	cur_in_key = in_keys_iter.next()
	1011	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
	1012
	1013	class InputDone(Exception): pass
	1014	class FixedDone(Exception): pass
	1015
	1016	output = []
	1017	cur_out = []
	1018
	1019	# TODO: Another possibility is that rather than iterating on each side,
	1020	# we could use a combination of bisecting and iterating. For
	1021	# example, while cur_in_key < fixed_key, bisect to find its
	1022	# point, then iterate all matching keys, then bisect (restricted
	1023	# to only the remainder) for the next one, etc.
	1024	try:
	1025	while True:
	1026	if cur_in_key < cur_fixed_key:
	1027	cur_keys = []
	1028	cur_out = (cur_fixed_offset, cur_keys)
	1029	output.append(cur_out)
	1030	while cur_in_key < cur_fixed_key:
	1031	cur_keys.append(cur_in_key)
	1032	try:
	1033	cur_in_key = in_keys_iter.next()
	1034	except StopIteration:
1035	raise InputDone
1036	# At this point cur_in_key must be >= cur_fixed_key
1037	# step the cur_fixed_key until we pass the cur key, or walk off
1038	# the end
1039	while cur_in_key >= cur_fixed_key:
1040	try:
1041	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
1042	except StopIteration:
1043	raise FixedDone
1044	except InputDone:
1045	# We consumed all of the input, nothing more to do
1046	pass
1047	except FixedDone:
1048	# There was some input left, but we consumed all of fixed, so we
1049	# have to add one more for the tail
1050	cur_keys = [cur_in_key]
1051	cur_keys.extend(in_keys_iter)
1052	cur_out = (len(fixed_keys), cur_keys)
1053	output.append(cur_out)
1054	return output
1055
4593.4.5 by John Arbash Meinel Start adding some tests.	1056	def _walk_through_internal_nodes(self, keys):
	1057	"""Take the given set of keys, and find the corresponding LeafNodes.
	1058
	1059	:param keys: An unsorted iterable of keys to search for
	1060	:return: (nodes, index_and_keys)
	1061	nodes is a dict mapping {index: LeafNode}
	1062	keys_at_index is a list of tuples of [(index, [keys for Leaf])]
	1063	"""
	1064	# 6 seconds spent in miss_torture using the sorted() line.
	1065	# Even with out of order disk IO it seems faster not to sort it when
	1066	# large queries are being made.
	1067	keys_at_index = [(0, sorted(keys))]
	1068
	1069	for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
	1070	node_indexes = [idx for idx, s_keys in keys_at_index]
	1071	nodes = self._get_internal_nodes(node_indexes)
	1072
	1073	next_nodes_and_keys = []
	1074	for node_index, sub_keys in keys_at_index:
	1075	node = nodes[node_index]
	1076	positions = self._multi_bisect_right(sub_keys, node.keys)
	1077	node_offset = next_row_start + node.offset
	1078	next_nodes_and_keys.extend([(node_offset + pos, s_keys)
	1079	for pos, s_keys in positions])
	1080	keys_at_index = next_nodes_and_keys
	1081	# We should now be at the _LeafNodes
	1082	node_indexes = [idx for idx, s_keys in keys_at_index]
	1083
	1084	# TODO: We may not want to always read all the nodes in one
	1085	# big go. Consider setting a max size on this.
	1086	nodes = self._get_leaf_nodes(node_indexes)
	1087	return nodes, keys_at_index
	1088
def iter_entries(self, keys):
    """Iterate over keys within the index.

    :param keys: An iterable providing the keys to be retrieved.
    :return: An iterable as per iter_all_entries, but restricted to the
        keys supplied. No additional keys will be returned, and every
        key supplied that is in the index will be returned.
    """
    # The multi-way bisecting done while walking the tree needs the keys
    # in sorted order; _walk_through_internal_nodes sorts them. We could
    # change the multi-way code to not require sorted order. (For example,
    # it bisects for the first node, does an in-order search until a key
    # comes before the current point, which it then bisects for, etc.)
    keys = frozenset(keys)
    if not keys:
        return

    if not self.key_count():
        return

    if self._leaf_value_cache is None:
        needed_keys = keys
    else:
        needed_keys = []
        for key in keys:
            cached = self._leaf_value_cache.get(key, None)
            if cached is None:
                # Not in the value cache, it has to be looked up in the
                # tree.
                needed_keys.append(key)
                continue
            # Answered straight from the value cache. The key must not be
            # searched for again below, or it would be yielded twice.
            value, refs = cached
            if self.node_ref_lists:
                yield (self, key, value, refs)
            else:
                yield (self, key, value)

    if not needed_keys:
        return
    nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
    for node_index, sub_keys in nodes_and_keys:
        if not sub_keys:
            continue
        node = nodes[node_index]
        for next_sub_key in sub_keys:
            if next_sub_key in node.keys:
                value, refs = node.keys[next_sub_key]
                if self.node_ref_lists:
                    yield (self, next_sub_key, value, refs)
                else:
                    yield (self, next_sub_key, value)
	1144
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1145	def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1146	"""Find the parent_map information for the set of keys.
	1147
	1148	This populates the parent_map dict and missing_keys set based on the
	1149	queried keys. It also can fill out an arbitrary number of parents that
	1150	it finds while searching for the supplied keys.
	1151
	1152	It is unlikely that you want to call this directly. See
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1153	"CombinedGraphIndex.find_ancestry()" for a more appropriate API.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1154
	1155	:param keys: A keys whose ancestry we want to return
	1156	Every key will either end up in 'parent_map' or 'missing_keys'.
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1157	:param ref_list_num: This index in the ref_lists is the parents we
	1158	care about.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1159	:param parent_map: {key: parent_keys} for keys that are present in this
	1160	index. This may contain more entries than were in 'keys', that are
	1161	reachable ancestors of the keys requested.
4593.4.5 by John Arbash Meinel Start adding some tests.	1162	:param missing_keys: keys which are known to be missing in this index.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1163	This may include parents that were not directly requested, but we
	1164	were able to determine that they are not present in this index.
	1165	:return: search_keys parents that were found but not queried to know
	1166	if they are missing or present. Callers can re-query this index for
	1167	those keys, and they will be placed into parent_map or missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1168	"""
	1169	if not self.key_count():
	1170	# We use key_count() to trigger reading the root node and
	1171	# determining info about this BTreeGraphIndex
	1172	# If we don't have any keys, then everything is missing
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1173	missing_keys.update(keys)
	1174	return set()
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1175	if ref_list_num >= self.node_ref_lists:
	1176	raise ValueError('No ref list %d, index has %d ref lists'
	1177	% (ref_list_num, self.node_ref_lists))
	1178
	1179	# The main trick we are trying to accomplish is that when we find a
	1180	# key listing its parents, we expect that the parent key is also likely
	1181	# to sit on the same page. Allowing us to expand parents quickly
	1182	# without suffering the full stack of bisecting, etc.
4593.4.5 by John Arbash Meinel Start adding some tests.	1183	nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1184
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1185	# These are parent keys which could not be immediately resolved on the
	1186	# page where the child was present. Note that we may already be
	1187	# searching for that key, and it may actually be present [or known
	1188	# missing] on one of the other pages we are reading.
	1189	# TODO:
	1190	# We could try searching for them in the immediate previous or next
	1191	# page. If they occur "later" we could put them in a pending lookup
	1192	# set, and then for each node we read thereafter we could check to
	1193	# see if they are present.
	1194	# However, we don't know the impact of keeping this list of things
	1195	# that I'm going to search for every node I come across from here on
	1196	# out.
	1197	# It doesn't handle the case when the parent key is missing on a
	1198	# page that we don't read. So we already have to handle being
	1199	# re-entrant for that.
	1200	# Since most keys contain a date string, they are more likely to be
	1201	# found earlier in the file than later, but we would know that right
	1202	# away (key < min_key), and wouldn't keep searching it on every other
	1203	# page that we read.
	1204	# Mostly, it is an idea, one which should be benchmarked.
	1205	parents_not_on_page = set()
	1206
	1207	for node_index, sub_keys in nodes_and_keys:
	1208	if not sub_keys:
	1209	continue
	1210	# sub_keys is all of the keys we are looking for that should exist
	1211	# on this page, if they aren't here, then they won't be found
	1212	node = nodes[node_index]
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1213	node_keys = node.keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1214	parents_to_check = set()
	1215	for next_sub_key in sub_keys:
4593.4.5 by John Arbash Meinel Start adding some tests.	1216	if next_sub_key not in node_keys:
	1217	# This one is just not present in the index at all
	1218	missing_keys.add(next_sub_key)
	1219	else:
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1220	value, refs = node_keys[next_sub_key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1221	parent_keys = refs[ref_list_num]
	1222	parent_map[next_sub_key] = parent_keys
	1223	parents_to_check.update(parent_keys)
	1224	# Don't look for things we've already found
	1225	parents_to_check = parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1226	# this can be used to test the benefit of having the check loop
	1227	# inlined.
	1228	# parents_not_on_page.update(parents_to_check)
	1229	# continue
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1230	while parents_to_check:
	1231	next_parents_to_check = set()
	1232	for key in parents_to_check:
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1233	if key in node_keys:
	1234	value, refs = node_keys[key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1235	parent_keys = refs[ref_list_num]
	1236	parent_map[key] = parent_keys
	1237	next_parents_to_check.update(parent_keys)
	1238	else:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1239	# This parent either is genuinely missing, or should be
	1240	# found on another page. Perf test whether it is better
	1241	# to check if this node should fit on this page or not.
	1242	# in the 'everything-in-one-pack' scenario, this not
	1243	# doing the check is 237ms vs 243ms.
	1244	# So slightly better, but I assume the standard 'lots
	1245	# of packs' is going to show a reasonable improvement
	1246	# from the check, because it avoids 'going around
	1247	# again' for everything that is in another index
4593.4.5 by John Arbash Meinel Start adding some tests.	1248	# parents_not_on_page.add(key)
	1249	# Missing for some reason
	1250	if key < node.min_key:
	1251	# in the case of bzr.dev, 3.4k/5.3k misses are
	1252	# 'earlier' misses (65%)
	1253	parents_not_on_page.add(key)
	1254	elif key > node.max_key:
	1255	# This parent key would be present on a different
	1256	# LeafNode
	1257	parents_not_on_page.add(key)
	1258	else:
	1259	# assert key != node.min_key and key != node.max_key
	1260	# If it was going to be present, it would be on
	1261	# this page, so mark it missing.
	1262	missing_keys.add(key)
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1263	parents_to_check = next_parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1264	# Might want to do another .difference() from missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1265	# parents_not_on_page could have been found on a different page, or be
	1266	# known to be missing. So cull out everything that has already been
	1267	# found.
4593.4.5 by John Arbash Meinel Start adding some tests.	1268	search_keys = parents_not_on_page.difference(
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1269	parent_map).difference(missing_keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1270	return search_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1271
def iter_entries_prefix(self, keys):
    """Iterate over keys within the index using prefix matching.

    Prefix matching is applied within the tuple of a key, not to within
    the bytestring of each key element. e.g. if you have the keys ('foo',
    'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
    only the former key is returned.

    WARNING: Note that this method currently causes a full index parse
    unconditionally (which is reasonably appropriate as it is a means for
    thunking many small indices into one larger one and still supplies
    iter_all_entries at the thunk layer).

    :param keys: An iterable providing the key prefixes to be retrieved.
        Each key prefix takes the form of a tuple the length of a key, but
        with the last N elements 'None' rather than a regular bytestring.
        The first element cannot be 'None'.
    :return: An iterable as per iter_all_entries, but restricted to the
        keys with a matching prefix to those supplied. No additional keys
        will be returned, and every match that is in the index will be
        returned.
    :raises errors.BadIndexKey: when a prefix starts with None or does not
        have the same number of elements as the keys of this index.
    """
    wanted = sorted(set(keys))
    if not wanted:
        return
    # Load if needed to check key lengths
    if self._key_count is None:
        self._get_root_node()
    key_length = self._key_length
    single_element = key_length == 1
    with_refs = self.node_ref_lists
    # TODO: only access nodes that can satisfy the prefixes we are looking
    # for. For now, to meet API usage (as this function is not used by
    # current bzrlib) just suck the entire index and iterate in memory.
    flat = {}    # key -> (value[, refs]); used for one-element keys
    tree = {}    # nested dicts, one level per key element, ending in records
    for entry in self.iter_all_entries():
        if with_refs:
            _index, key, value, refs = entry
            record = (key, value, refs)
        else:
            _index, key, value = entry
            record = (key, value)
        if single_element:
            flat[key] = record[1:]
            continue
        # For a key of (foo, bar, baz) store tree[foo][bar][baz] = record
        level = tree
        for element in key[:-1]:
            level = level.setdefault(element, {})
        level[key[-1]] = record

    for key in wanted:
        # sanity check
        if key[0] is None or len(key) != key_length:
            raise errors.BadIndexKey(key)
        if single_element:
            try:
                found = flat[key]
            except KeyError:
                continue
            yield (self, key) + found
            continue
        # Walk down the tree along the leading non-None elements to find
        # the subtree (or the single record) this prefix refers to.
        subtree = tree
        depth = 0
        try:
            while depth < len(key) and key[depth] is not None:
                subtree = subtree[key[depth]]
                depth += 1
        except KeyError:
            # a non-existent lookup.
            continue
        if depth == len(key):
            # Every element was concrete, so we landed on a single record.
            yield (self, ) + subtree
            continue
        pending = [subtree]
        while pending:
            current = pending.pop(-1)
            # can't be empty or would not exist
            _element, sample = current.iteritems().next()
            if isinstance(sample, dict):
                # still an interior level: descend into every child
                pending.extend(current.itervalues())
            else:
                # leaf level: each value is a complete record, ready to
                # yield once the index itself is prepended.
                for record in current.itervalues():
                    yield (self, ) + record
1383
def key_count(self):
    """Return an estimate of the number of keys in this index.

    For BTreeGraphIndex the estimate is exact as it is contained in the
    header.
    """
    if self._key_count is not None:
        return self._key_count
    # The count is not known until the root node has been fetched.
    self._get_root_node()
    return self._key_count
1393
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1394	def _compute_row_offsets(self):
	1395	"""Fill out the _row_offsets attribute based on _row_lengths."""
	1396	offsets = []
	1397	row_offset = 0
	1398	for row in self._row_lengths:
	1399	offsets.append(row_offset)
	1400	row_offset += row
	1401	offsets.append(row_offset)
	1402	self._row_offsets = offsets
	1403
def _parse_header_from_bytes(self, bytes):
    """Parse the header from a region of bytes.

    The header is the signature followed by four option lines, in order:
    node reference list count, key element count, key count and the
    comma separated row lengths. Parsing stores them on self as
    node_ref_lists, _key_length, _key_count and _row_lengths, and then
    recomputes _row_offsets.

    :param bytes: The data to parse.
    :return: An offset, data tuple such as readv yields, for the unparsed
        data. (which may be of length 0).
    :raises errors.BadIndexFormatSignature: if the data does not start with
        this index type's signature.
    :raises errors.BadIndexOptions: if any of the four option lines is
        absent, carries the wrong prefix, or has a non-integer value.
    """
    expected_signature = self._signature()
    signature = bytes[0:len(expected_signature)]
    if not signature == expected_signature:
        raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
    lines = bytes[len(expected_signature):].splitlines()
    if len(lines) < 4:
        # A truncated header would otherwise surface as a bare IndexError
        # from the lookups below; report it like any other bad header.
        raise errors.BadIndexOptions(self)

    def option_value(line, prefix):
        # Return the text following 'prefix' on an option line.
        if not line.startswith(prefix):
            raise errors.BadIndexOptions(self)
        return line[len(prefix):]

    try:
        self.node_ref_lists = int(option_value(lines[0], _OPTION_NODE_REFS))
        self._key_length = int(
            option_value(lines[1], _OPTION_KEY_ELEMENTS))
        self._key_count = int(option_value(lines[2], _OPTION_LEN))
        row_lengths = option_value(lines[3], _OPTION_ROW_LENGTHS)
        # Empty fields are skipped.
        self._row_lengths = [int(length)
                             for length in row_lengths.split(',')
                             if len(length)]
    except ValueError:
        raise errors.BadIndexOptions(self)
    self._compute_row_offsets()

    # calculate the bytes we have processed: the signature, the four option
    # lines, and one line terminator for each of those lines.
    header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
    return header_end, bytes[header_end:]
	1450
	1451	def _read_nodes(self, nodes):
	1452	"""Read some nodes from disk into the LRU cache.
	1453
	1454	This performs a readv to get the node data into memory, and parses each
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1455	node, then yields it to the caller. The nodes are requested in the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1456	supplied order. If possible doing sort() on the list before requesting
	1457	a read may improve performance.
	1458
	1459	:param nodes: The nodes to read. 0 - first node, 1 - second node etc.
	1460	:return: None
	1461	"""
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1462	# may be the byte string of the whole file
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1463	bytes = None
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1464	# list of (offset, length) regions of the file that should, evenually
	1465	# be read in to data_ranges, either from 'bytes' or from the transport
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1466	ranges = []
	1467	for index in nodes:
	1468	offset = index * _PAGE_SIZE
	1469	size = _PAGE_SIZE
	1470	if index == 0:
	1471	# Root node - special case
	1472	if self._size:
	1473	size = min(_PAGE_SIZE, self._size)
	1474	else:
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1475	# The only case where we don't know the size, is for very
	1476	# small indexes. So we read the whole thing
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1477	bytes = self._transport.get_bytes(self._name)
	1478	self._size = len(bytes)
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1479	# the whole thing should be parsed out of 'bytes'
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1480	ranges.append((0, len(bytes)))
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1481	break
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1482	else:
3763.8.6 by John Arbash Meinel Fix the logic a bit, and add a bit more tweaking opportunities	1483	if offset > self._size:
	1484	raise AssertionError('tried to read past the end'
	1485	' of the file %s > %s'
	1486	% (offset, self._size))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1487	size = min(size, self._size - offset)
	1488	ranges.append((offset, size))
	1489	if not ranges:
	1490	return
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1491	elif bytes is not None:
	1492	# already have the whole file
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1493	data_ranges = [(start, bytes[start:start+_PAGE_SIZE])
	1494	for start in xrange(0, len(bytes), _PAGE_SIZE)]
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1495	elif self._file is None:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1496	data_ranges = self._transport.readv(self._name, ranges)
	1497	else:
	1498	data_ranges = []
	1499	for offset, size in ranges:
	1500	self._file.seek(offset)
	1501	data_ranges.append((offset, self._file.read(size)))
	1502	for offset, data in data_ranges:
	1503	if offset == 0:
	1504	# extract the header
	1505	offset, data = self._parse_header_from_bytes(data)
	1506	if len(data) == 0:
	1507	continue
	1508	bytes = zlib.decompress(data)
	1509	if bytes.startswith(_LEAF_FLAG):
	1510	node = _LeafNode(bytes, self._key_length, self.node_ref_lists)
	1511	elif bytes.startswith(_INTERNAL_FLAG):
	1512	node = _InternalNode(bytes)
	1513	else:
	1514	raise AssertionError("Unknown node type for %r" % bytes)
	1515	yield offset / _PAGE_SIZE, node
	1516
	1517	def _signature(self):
	1518	"""The file signature for this index type."""
	1519	return _BTSIGNATURE
	1520
	1521	def validate(self):
	1522	"""Validate that everything in the index can be accessed."""
	1523	# just read and parse every node.
	1524	self._get_root_node()
	1525	if len(self._row_lengths) > 1:
	1526	start_node = self._row_offsets[1]
	1527	else:
	1528	# We shouldn't be reading anything anyway
	1529	start_node = 1
	1530	node_end = self._row_offsets[-1]
	1531	for node in self._read_nodes(range(start_node, node_end)):
	1532	pass
	1533
	1534
	1535	try:
4459.2.1 by Vincent Ladeuil Use a consistent scheme for naming pyrex source files.	1536	from bzrlib import _btree_serializer_pyx as _btree_serializer
4574.3.6 by Martin Pool More warnings when failing to load extensions	1537	except ImportError, e:
4574.3.8 by Martin Pool Only mutter extension load errors when they occur, and record for later	1538	osutils.failed_to_load_extension(e)
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	1539	from bzrlib import _btree_serializer_py as _btree_serializer