~bzr-pqm/bzr/bzr.dev : contents of bzrlib/btree

~bzr-pqm/bzr/bzr.dev : (revision 6374.1.3)

5752.3.8 by John Arbash Meinel Merge bzr.dev 5764 to resolve release-notes (aka NEWS) conflicts	1	# Copyright (C) 2008-2011 Canonical Ltd
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	2	#
	3	# This program is free software; you can redistribute it and/or modify
3641.3.29 by John Arbash Meinel Cleanup the copyright headers	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob update FSF mailing address	15	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	16	#
	17
	18	"""B+Tree indices"""
	19
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	20	import cStringIO
5753.2.2 by Jelmer Vernooij Remove some unnecessary imports, clean up lazy imports.	21
	22	from bzrlib.lazy_import import lazy_import
	23	lazy_import(globals(), """
5753.2.4 by Jelmer Vernooij Review feedback from John.	24	import bisect
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	25	import math
	26	import tempfile
	27	import zlib
5753.2.2 by Jelmer Vernooij Remove some unnecessary imports, clean up lazy imports.	28	""")
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	29
	30	from bzrlib import (
	31	chunk_writer,
	32	debug,
	33	errors,
4208.1.2 by John Arbash Meinel Switch to using a FIFOCache.	34	fifo_cache,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	35	index,
	36	lru_cache,
	37	osutils,
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	38	static_tuple,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	39	trace,
5273.1.7 by Vincent Ladeuil No more use of the get_transport imported symbol, all uses are through	40	transport,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	41	)
	42	from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
	43
	44
# First line of every serialised B+Tree index.
_BTSIGNATURE = "B+Tree Graph Index 2\n"
# Prefix of the header option line that lists the node count of each row.
_OPTION_ROW_LENGTHS = "row_lengths="
# First line written into a leaf node.
_LEAF_FLAG = "type=leaf\n"
# First line written into an internal node.
_INTERNAL_FLAG = "type=internal\n"
# Prefix of the line in an internal node that records a node count taken from
# the row below it.
_INTERNAL_OFFSET = "offset="

# Bytes set aside at the start of the first node of a row for the index
# header; the header must fit within this many bytes.
_RESERVED_HEADER_BYTES = 120
# Size in bytes of one node (page) of the index.
_PAGE_SIZE = 4096

# 4K per page: 4MB - 1000 entries
_NODE_CACHE_SIZE = 1000
	56
	57
	58	class _BuilderRow(object):
	59	"""The stored state accumulated while writing out a row in the index.
	60
	61	:ivar spool: A temporary file used to accumulate nodes for this row
	62	in the tree.
	63	:ivar nodes: The count of nodes emitted so far.
	64	"""
	65
	66	def __init__(self):
	67	"""Create a _BuilderRow."""
	68	self.nodes = 0
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	69	self.spool = None# tempfile.TemporaryFile(prefix='bzr-index-row-')
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	70	self.writer = None
	71
	72	def finish_node(self, pad=True):
	73	byte_lines, _, padding = self.writer.finish()
	74	if self.nodes == 0:
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	75	self.spool = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	76	# padded note:
	77	self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	78	elif self.nodes == 1:
	79	# We got bigger than 1 node, switch to a temp file
	80	spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
	81	spool.write(self.spool.getvalue())
	82	self.spool = spool
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	83	skipped_bytes = 0
	84	if not pad and padding:
	85	del byte_lines[-1]
	86	skipped_bytes = padding
	87	self.spool.writelines(byte_lines)
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	88	remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
	89	if remainder != 0:
	90	raise AssertionError("incorrect node length: %d, %d"
	91	% (self.spool.tell(), remainder))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	92	self.nodes += 1
	93	self.writer = None
	94
	95
	96	class _InternalBuilderRow(_BuilderRow):
	97	"""The stored state accumulated while writing out internal rows."""
	98
	99	def finish_node(self, pad=True):
	100	if not pad:
	101	raise AssertionError("Must pad internal nodes only.")
	102	_BuilderRow.finish_node(self)
	103
	104
class _LeafBuilderRow(_BuilderRow):
    """The stored state accumulated while writing out the leaf row."""
	107
	108
	109	class BTreeBuilder(index.GraphIndexBuilder):
	110	"""A Builder for B+Tree based Graph indices.
	111
	112	The resulting graph has the structure:
	113
	114	_SIGNATURE OPTIONS NODES
	115	_SIGNATURE := 'B+Tree Graph Index 2' NEWLINE
	116	OPTIONS := REF_LISTS KEY_ELEMENTS LENGTH ROW_LENGTHS
	117	REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
	118	KEY_ELEMENTS := 'key_elements=' DIGITS NEWLINE
	119	LENGTH := 'len=' DIGITS NEWLINE
	120	ROW_LENGTHS := 'row_lengths=' DIGITS (COMMA DIGITS)* NEWLINE
	121	NODES := NODE_COMPRESSED*
	122	NODE_COMPRESSED := COMPRESSED_BYTES{4096}
	123	NODE_RAW := INTERNAL | LEAF
	124	INTERNAL := INTERNAL_FLAG POINTERS
	125	LEAF := LEAF_FLAG ROWS
	126	KEY_ELEMENT := Not-whitespace-utf8
	127	KEY := KEY_ELEMENT (NULL KEY_ELEMENT)*
	128	ROWS := ROW*
	129	ROW := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
	130	ABSENT := 'a'
	131	REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
	132	REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
	133	REFERENCE := KEY
	134	VALUE := no-newline-no-null-bytes
	135	"""
	136
	137	def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
	138	"""See GraphIndexBuilder.__init__.
	139
	140	:param spill_at: Optional parameter controlling the maximum number
	141	of nodes that BTreeBuilder will hold in memory.
	142	"""
	143	index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
	144	key_elements=key_elements)
	145	self._spill_at = spill_at
	146	self._backing_indices = []
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	147	# A map of {key: (node_refs, value)}
	148	self._nodes = {}
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	149	# Indicate it hasn't been built yet
	150	self._nodes_by_key = None
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	151	self._optimize_for_size = False
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	152
	153	def add_node(self, key, value, references=()):
	154	"""Add a node to the index.
	155
	156	If adding the node causes the builder to reach its spill_at threshold,
	157	disk spilling will be triggered.
	158
	159	:param key: The key. keys are non-empty tuples containing
	160	as many whitespace-free utf8 bytestrings as the key length
	161	defined for this index.
	162	:param references: An iterable of iterables of keys. Each is a
	163	reference to another key.
	164	:param value: The value to associate with the key. It may be any
5891.1.3 by Andrew Bennetts Move docstring formatting fixes.	165	bytes as long as it does not contain \\0 or \\n.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	166	"""
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	167	# Ensure that 'key' is a StaticTuple
	168	key = static_tuple.StaticTuple.from_sequence(key).intern()
3644.2.9 by John Arbash Meinel Refactor some code.	169	# we don't care about absent_references
3644.2.9 by John Arbash Meinel Refactor some code.	170	node_refs, _ = self._check_key_ref_value(key, references, value)
3644.2.2 by John Arbash Meinel the new btree index doesn't have 'absent' keys in its _nodes	171	if key in self._nodes:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	172	raise errors.BadIndexDuplicateKey(key, self)
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	173	self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
3644.2.9 by John Arbash Meinel Refactor some code.	174	if self._nodes_by_key is not None and self._key_length > 1:
3644.2.9 by John Arbash Meinel Refactor some code.	175	self._update_nodes_by_key(key, value, node_refs)
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	176	if len(self._nodes) < self._spill_at:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	177	return
3644.2.9 by John Arbash Meinel Refactor some code.	178	self._spill_mem_keys_to_disk()
	179
	180	def _spill_mem_keys_to_disk(self):
	181	"""Write the in memory keys down to disk to cap memory consumption.
	182
	183	If we already have some keys written to disk, we will combine them so
	184	as to preserve the sorted order. The algorithm for combining uses
	185	powers of two. So on the first spill, write all mem nodes into a
	186	single index. On the second spill, combine the mem nodes with the nodes
	187	on disk to create a 2x sized disk index and get rid of the first index.
	188	On the third spill, create a single new disk index, which will contain
	189	the mem nodes, and preserve the existing 2x sized index. On the fourth,
	190	combine mem with the first and second indexes, creating a new one of
	191	size 4x. On the fifth create a single new one, etc.
	192	"""
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	193	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	194	(new_backing_file, size,
	195	backing_pos) = self._spill_mem_keys_and_combine()
	196	else:
	197	new_backing_file, size = self._spill_mem_keys_without_combining()
	198	# Note: The transport here isn't strictly needed, because we will use
	199	# direct access to the new_backing._file object
6083.1.1 by Jelmer Vernooij Use get_transport_from_{url,path} in more places.	200	new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
5273.1.7 by Vincent Ladeuil No more use of the get_transport imported symbol, all uses are through	201	'<temp>', size)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	202	# GC will clean up the file
	203	new_backing._file = new_backing_file
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	204	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	205	if len(self._backing_indices) == backing_pos:
	206	self._backing_indices.append(None)
	207	self._backing_indices[backing_pos] = new_backing
	208	for backing_pos in range(backing_pos):
	209	self._backing_indices[backing_pos] = None
	210	else:
	211	self._backing_indices.append(new_backing)
	212	self._nodes = {}
	213	self._nodes_by_key = None
	214
	215	def _spill_mem_keys_without_combining(self):
	216	return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
	217
	218	def _spill_mem_keys_and_combine(self):
4168.3.4 by John Arbash Meinel Restore the ability to spill, but prepare a flag to disable it.	219	iterators_to_combine = [self._iter_mem_nodes()]
	220	pos = -1
	221	for pos, backing in enumerate(self._backing_indices):
	222	if backing is None:
	223	pos -= 1
	224	break
	225	iterators_to_combine.append(backing.iter_all_entries())
	226	backing_pos = pos + 1
	227	new_backing_file, size = \
	228	self._write_nodes(self._iter_smallest(iterators_to_combine),
	229	allow_optimize=False)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	230	return new_backing_file, size, backing_pos
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	231
	232	def add_nodes(self, nodes):
	233	"""Add nodes to the index.
	234
	235	:param nodes: An iterable of (key, node_refs, value) entries to add.
	236	"""
	237	if self.reference_lists:
	238	for (key, value, node_refs) in nodes:
	239	self.add_node(key, value, node_refs)
	240	else:
	241	for (key, value) in nodes:
	242	self.add_node(key, value)
	243
	244	def _iter_mem_nodes(self):
	245	"""Iterate over the nodes held in memory."""
3644.2.8 by John Arbash Meinel Two quick tweaks.	246	nodes = self._nodes
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	247	if self.reference_lists:
3644.2.8 by John Arbash Meinel Two quick tweaks.	248	for key in sorted(nodes):
	249	references, value = nodes[key]
	250	yield self, key, value, references
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	251	else:
3644.2.8 by John Arbash Meinel Two quick tweaks.	252	for key in sorted(nodes):
	253	references, value = nodes[key]
	254	yield self, key, value
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	255
	256	def _iter_smallest(self, iterators_to_combine):
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	257	if len(iterators_to_combine) == 1:
	258	for value in iterators_to_combine[0]:
	259	yield value
	260	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	261	current_values = []
	262	for iterator in iterators_to_combine:
	263	try:
	264	current_values.append(iterator.next())
	265	except StopIteration:
	266	current_values.append(None)
	267	last = None
	268	while True:
	269	# Decorate candidates with the value to allow 2.4's min to be used.
	270	candidates = [(item[1][1], item) for item
	271	in enumerate(current_values) if item[1] is not None]
	272	if not len(candidates):
	273	return
	274	selected = min(candidates)
	275	# undecorate back to (pos, node)
	276	selected = selected[1]
	277	if last == selected[1][1]:
	278	raise errors.BadIndexDuplicateKey(last, self)
	279	last = selected[1][1]
	280	# Yield, with self as the index
	281	yield (self,) + selected[1][1:]
	282	pos = selected[0]
	283	try:
	284	current_values[pos] = iterators_to_combine[pos].next()
	285	except StopIteration:
	286	current_values[pos] = None
	287
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	288	def _add_key(self, string_key, line, rows, allow_optimize=True):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	289	"""Add a key to the current chunk.
	290
	291	:param string_key: The key to add.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	292	:param line: The fully serialised key and value.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	293	:param allow_optimize: If set to False, prevent setting the optimize
	294	flag when writing out. This is used by the _spill_mem_keys_to_disk
	295	functionality.
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	296	"""
6178.2.9 by Shannon Weyrick A version of the patch, based on suggestions from John Meinel, which detects an empty page differently to avoid false positives.	297	new_leaf = False
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	298	if rows[-1].writer is None:
	299	# opening a new leaf chunk;
6178.2.9 by Shannon Weyrick A version of the patch, based on suggestions from John Meinel, which detects an empty page differently to avoid false positives.	300	new_leaf = True
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	301	for pos, internal_row in enumerate(rows[:-1]):
	302	# flesh out any internal nodes that are needed to
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	303	# preserve the height of the tree
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	304	if internal_row.writer is None:
	305	length = _PAGE_SIZE
	306	if internal_row.nodes == 0:
	307	length -= _RESERVED_HEADER_BYTES # padded
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	308	if allow_optimize:
	309	optimize_for_size = self._optimize_for_size
	310	else:
	311	optimize_for_size = False
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	312	internal_row.writer = chunk_writer.ChunkWriter(length, 0,
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	313	optimize_for_size=optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	314	internal_row.writer.write(_INTERNAL_FLAG)
	315	internal_row.writer.write(_INTERNAL_OFFSET +
	316	str(rows[pos + 1].nodes) + "\n")
	317	# add a new leaf
	318	length = _PAGE_SIZE
	319	if rows[-1].nodes == 0:
	320	length -= _RESERVED_HEADER_BYTES # padded
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	321	rows[-1].writer = chunk_writer.ChunkWriter(length,
	322	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	323	rows[-1].writer.write(_LEAF_FLAG)
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	324	if rows[-1].writer.write(line):
6178.2.9 by Shannon Weyrick A version of the patch, based on suggestions from John Meinel, which detects an empty page differently to avoid false positives.	325	# if we failed to write, despite having an empty page to write to,
	326	# then line is too big. raising the error avoids infinite recursion
	327	# searching for a suitably large page that will not be found.
	328	if new_leaf:
	329	raise errors.BadIndexKey(string_key)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	330	# this key did not fit in the node:
	331	rows[-1].finish_node()
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	332	key_line = string_key + "\n"
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	333	new_row = True
	334	for row in reversed(rows[:-1]):
	335	# Mark the start of the next node in the node above. If it
4031.3.1 by Frank Aspell Fixing various typos	336	# doesn't fit then propagate upwards until we find one that
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	337	# it does fit into.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	338	if row.writer.write(key_line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	339	row.finish_node()
	340	else:
	341	# We've found a node that can handle the pointer.
	342	new_row = False
	343	break
	344	# If we reached the current root without being able to mark the
	345	# division point, then we need a new root:
	346	if new_row:
	347	# We need a new row
	348	if 'index' in debug.debug_flags:
	349	trace.mutter('Inserting new global row.')
	350	new_row = _InternalBuilderRow()
	351	reserved_bytes = 0
	352	rows.insert(0, new_row)
	353	# This will be padded, hence the -100
	354	new_row.writer = chunk_writer.ChunkWriter(
	355	_PAGE_SIZE - _RESERVED_HEADER_BYTES,
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	356	reserved_bytes,
	357	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	358	new_row.writer.write(_INTERNAL_FLAG)
	359	new_row.writer.write(_INTERNAL_OFFSET +
	360	str(rows[1].nodes - 1) + "\n")
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	361	new_row.writer.write(key_line)
6178.2.4 by Shannon Weyrick raise BadIndexKey instead of skipping	362	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	363
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	364	def _write_nodes(self, node_iterator, allow_optimize=True):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	365	"""Write node_iterator out as a B+Tree.
	366
	367	:param node_iterator: An iterator of sorted nodes. Each node should
	368	match the output given by iter_all_entries.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	369	:param allow_optimize: If set to False, prevent setting the optimize
	370	flag when writing out. This is used by the _spill_mem_keys_to_disk
	371	functionality.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	372	:return: A file handle for a temporary file containing a B+Tree for
	373	the nodes.
	374	"""
	375	# The index rows - rows[0] is the root, rows[1] is the layer under it
	376	# etc.
	377	rows = []
	378	# forward sorted by key. In future we may consider topological sorting,
	379	# at the cost of table scans for direct lookup, or a second index for
	380	# direct lookup
	381	key_count = 0
	382	# A stack with the number of nodes of each size. 0 is the root node
	383	# and must always be 1 (if there are any nodes in the tree).
	384	self.row_lengths = []
	385	# Loop over all nodes adding them to the bottom row
	386	# (rows[-1]). When we finish a chunk in a row,
4031.3.1 by Frank Aspell Fixing various typos	387	# propagate the key that didn't fit (comes after the chunk) to the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	388	# row above, transitively.
	389	for node in node_iterator:
	390	if key_count == 0:
	391	# First key triggers the first row
	392	rows.append(_LeafBuilderRow())
6178.2.4 by Shannon Weyrick raise BadIndexKey instead of skipping	393	key_count += 1
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	394	string_key, line = _btree_serializer._flatten_node(node,
	395	self.reference_lists)
6178.2.4 by Shannon Weyrick raise BadIndexKey instead of skipping	396	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	397	for row in reversed(rows):
	398	pad = (type(row) != _LeafBuilderRow)
	399	row.finish_node(pad=pad)
	400	lines = [_BTSIGNATURE]
	401	lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
	402	lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
	403	lines.append(_OPTION_LEN + str(key_count) + '\n')
	404	row_lengths = [row.nodes for row in rows]
	405	lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	406	if row_lengths and row_lengths[-1] > 1:
	407	result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
	408	else:
	409	result = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	410	result.writelines(lines)
	411	position = sum(map(len, lines))
	412	root_row = True
	413	if position > _RESERVED_HEADER_BYTES:
	414	raise AssertionError("Could not fit the header in the"
	415	" reserved space: %d > %d"
	416	% (position, _RESERVED_HEADER_BYTES))
	417	# write the rows out:
	418	for row in rows:
	419	reserved = _RESERVED_HEADER_BYTES # reserved space for first node
	420	row.spool.flush()
	421	row.spool.seek(0)
	422	# copy nodes to the finalised file.
	423	# Special case the first node as it may be prefixed
	424	node = row.spool.read(_PAGE_SIZE)
	425	result.write(node[reserved:])
4771.3.1 by John Arbash Meinel We don't have to pad 'short' records.	426	if len(node) == _PAGE_SIZE:
	427	result.write("\x00" * (reserved - position))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	428	position = 0 # Only the root row actually has an offset
	429	copied_len = osutils.pumpfile(row.spool, result)
	430	if copied_len != (row.nodes - 1) * _PAGE_SIZE:
	431	if type(row) != _LeafBuilderRow:
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	432	raise AssertionError("Incorrect amount of data copied"
	433	" expected: %d, got: %d"
	434	% ((row.nodes - 1) * _PAGE_SIZE,
	435	copied_len))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	436	result.flush()
	437	size = result.tell()
	438	result.seek(0)
	439	return result, size
	440
	441	def finish(self):
	442	"""Finalise the index.
	443
	444	:return: A file handle for a temporary file containing the nodes added
	445	to the index.
	446	"""
	447	return self._write_nodes(self.iter_all_entries())[0]
	448
	449	def iter_all_entries(self):
	450	"""Iterate over all keys within the index
	451
4343.2.2 by John Arbash Meinel Fix an important doc bug about the api of iter_all_entries()	452	:return: An iterable of (index, key, value, reference_lists). There is
	453	no defined order for the result iteration - it will be in the most
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	454	efficient order for the index (in this case dictionary hash order).
	455	"""
	456	if 'evil' in debug.debug_flags:
	457	trace.mutter_callsite(3,
	458	"iter_all_entries scales with size of history.")
	459	# Doing serial rather than ordered would be faster; but this shouldn't
	460	# be getting called routinely anyway.
3644.2.8 by John Arbash Meinel Two quick tweaks.	461	iterators = [self._iter_mem_nodes()]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	462	for backing in self._backing_indices:
	463	if backing is not None:
	464	iterators.append(backing.iter_all_entries())
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	465	if len(iterators) == 1:
	466	return iterators[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	467	return self._iter_smallest(iterators)
	468
	469	def iter_entries(self, keys):
	470	"""Iterate over keys within the index.
	471
	472	:param keys: An iterable providing the keys to be retrieved.
	473	:return: An iterable of (index, key, value, reference_lists). There is no
	474	defined order for the result iteration - it will be in the most
	475	efficient order for the index (keys iteration order in this case).
	476	"""
	477	keys = set(keys)
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	478	# Note: We don't use keys.intersection() here. If you read the C api,
	479	# set.intersection(other) special cases when other is a set and
	480	# will iterate the smaller of the two and lookup in the other.
	481	# It does not do this for any other type (even dict, unlike
	482	# some other set functions.) Since we expect keys is generally <<
	483	# self._nodes, it is faster to iterate over it in a list
	484	# comprehension
	485	nodes = self._nodes
	486	local_keys = [key for key in keys if key in nodes]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	487	if self.reference_lists:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	488	for key in local_keys:
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	489	node = nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	490	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	491	else:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	492	for key in local_keys:
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	493	node = nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	494	yield self, key, node[1]
3847.2.1 by John Arbash Meinel Shortcut BTreeBuilder.iter_entries when there are no backing indices.	495	# Find things that are in backing indices that have not been handled
	496	# yet.
3847.2.3 by John Arbash Meinel Bring back the shortcut	497	if not self._backing_indices:
3847.2.3 by John Arbash Meinel Bring back the shortcut	498	return # We won't find anything there either
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	499	# Remove all of the keys that we found locally
	500	keys.difference_update(local_keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	501	for backing in self._backing_indices:
	502	if backing is None:
	503	continue
	504	if not keys:
	505	return
	506	for node in backing.iter_entries(keys):
	507	keys.remove(node[1])
	508	yield (self,) + node[1:]
	509
    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        Matches from the backing indices are yielded first, followed by
        matches from the nodes held in memory.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        :raises BadIndexKey: if a prefix starts with None or does not have
            the key length of this index. This is a generator, so the error
            surfaces during iteration, when that prefix is reached.
        """
        # XXX: Too much duplication with the GraphIndex class; consider finding
        # a good place to pull out the actual common logic.
        keys = set(keys)
        if not keys:
            return
        # Backing indices first; their entries are re-labelled with self as
        # the index.
        for backing in self._backing_indices:
            if backing is None:
                continue
            for node in backing.iter_entries_prefix(keys):
                yield (self,) + node[1:]
        if self._key_length == 1:
            # With one-element keys a valid prefix is a complete key, so a
            # direct lookup in the in-memory nodes is enough.
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                try:
                    node = self._nodes[key]
                except KeyError:
                    continue
                if self.reference_lists:
                    yield self, key, node[1], node[0]
                else:
                    yield self, key, node[1]
            return
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = self._get_nodes_by_key()
            elements = list(key)
            # find the subdict to return
            try:
                # Walk down one nested dict per leading non-None element.
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements.pop(0)
            except KeyError:
                # a non-existent lookup.
                continue
            if len(elements):
                # Trailing None elements remain: walk the whole subtree
                # below this point, using dicts as a stack.
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    item, value = key_dict.iteritems().next()
                    if type(value) == dict:
                        # push keys
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            yield (self, ) + tuple(value)
            else:
                # Every element was consumed, so key_dict is now the stored
                # entry for this exact key rather than a dict.
                yield (self, ) + key_dict
	585
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	586	def _get_nodes_by_key(self):
	587	if self._nodes_by_key is None:
	588	nodes_by_key = {}
	589	if self.reference_lists:
	590	for key, (references, value) in self._nodes.iteritems():
	591	key_dict = nodes_by_key
	592	for subkey in key[:-1]:
	593	key_dict = key_dict.setdefault(subkey, {})
	594	key_dict[key[-1]] = key, value, references
	595	else:
	596	for key, (references, value) in self._nodes.iteritems():
	597	key_dict = nodes_by_key
	598	for subkey in key[:-1]:
	599	key_dict = key_dict.setdefault(subkey, {})
	600	key_dict[key[-1]] = key, value
	601	self._nodes_by_key = nodes_by_key
	602	return self._nodes_by_key
	603
def key_count(self):
    """Return an estimate of the number of keys in this index.

    The count is the number of in-memory nodes plus the key_count() of
    every backing index that is not None.

    For InMemoryGraphIndex the estimate is exact.
    """
    total = len(self._nodes)
    for backing in self._backing_indices:
        if backing is not None:
            total += backing.key_count()
    return total
	611
	612	def validate(self):
	613	"""In memory index's have no known corruption at the moment."""
	614
	615
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	616	class _LeafNode(dict):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	617	"""A leaf node for a serialised B+Tree index."""
	618
5365.5.23 by John Arbash Meinel A __sizeof__ check that ensure we are getting what we are looking for.	619	__slots__ = ('min_key', 'max_key', '_keys')
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	620
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	621	def __init__(self, bytes, key_length, ref_list_length):
	622	"""Parse bytes to create a leaf node object."""
	623	# splitlines mangles the \r delimiters.. don't use it.
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	624	key_list = _btree_serializer._parse_leaf_lines(bytes,
	625	key_length, ref_list_length)
	626	if key_list:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	627	self.min_key = key_list[0][0]
	628	self.max_key = key_list[-1][0]
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	629	else:
	630	self.min_key = self.max_key = None
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	631	super(_LeafNode, self).__init__(key_list)
5365.5.23 by John Arbash Meinel A __sizeof__ check that ensure we are getting what we are looking for.	632	self._keys = dict(self)
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	633
	634	def all_items(self):
	635	"""Return a sorted list of (key, (value, refs)) items"""
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	636	items = self.items()
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	637	items.sort()
	638	return items
	639
	640	def all_keys(self):
	641	"""Return a sorted list of all keys."""
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	642	keys = self.keys()
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	643	keys.sort()
	644	return keys
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	645
	646
	647	class _InternalNode(object):
	648	"""An internal node for a serialised B+Tree index."""
	649
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	650	__slots__ = ('keys', 'offset')
	651
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	652	def __init__(self, bytes):
	653	"""Parse bytes to create an internal node object."""
	654	# splitlines mangles the \r delimiters.. don't use it.
	655	self.keys = self._parse_lines(bytes.split('\n'))
	656
	657	def _parse_lines(self, lines):
	658	nodes = []
	659	self.offset = int(lines[1][7:])
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	660	as_st = static_tuple.StaticTuple.from_sequence
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	661	for line in lines[2:]:
	662	if line == '':
	663	break
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	664	nodes.append(as_st(map(intern, line.split('\0'))).intern())
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	665	return nodes
	666
	667
	668	class BTreeGraphIndex(object):
	669	"""Access to nodes via the standard GraphIndex interface for B+Tree's.
	670
	671	Individual nodes are held in a LRU cache. This holds the root node in
	672	memory except when very large walks are done.
	673	"""
	674
def __init__(self, transport, name, size, unlimited_cache=False,
             offset=0):
    """Create a B+Tree index object on the index name.

    :param transport: The transport to read data for the index from.
    :param name: The file name of the index on transport.
    :param size: Optional size of the index in bytes. This allows
        compatibility with the GraphIndex API, as well as ensuring that
        the initial read (to read the root node header) can be done
        without over-reading even on empty indices, and on small indices
        allows single-IO to read the entire index.
    :param unlimited_cache: If set to True, then instead of using an
        LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
        cache all leaf nodes.
    :param offset: The start of the btree index data isn't byte 0 of the
        file. Instead it starts at some point later.
    """
    # Where the index lives.
    self._transport = transport
    self._name = name
    self._size = size
    self._file = None
    self._base_offset = offset
    self._recommended_pages = self._compute_recommended_pages()
    # Nothing has been parsed yet.
    self._root_node = None
    self._key_count = None
    self._row_lengths = None
    # Start of each row, [-1] is the end
    self._row_offsets = None
    self._leaf_factory = _LeafNode
    # No direct key => value cache by default; a candidate would be
    # lru_cache.LRUCache(100*1000), i.e. a max size of 100,000 leaf values.
    self._leaf_value_cache = None
    if not unlimited_cache:
        self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
        # We use a FIFO here just to prevent possible blowout. However, a
        # 300k record btree has only 3k leaf nodes, and only 20 internal
        # nodes. A value of 100 scales to ~100*100*100 = 1M records.
        self._internal_node_cache = fifo_cache.FIFOCache(100)
    else:
        # Plain dicts never evict anything.
        self._leaf_node_cache = {}
        self._internal_node_cache = {}
	714
	715	def __eq__(self, other):
	716	"""Equal when self and other were created with the same parameters."""
	717	return (
	718	type(self) == type(other) and
	719	self._transport == other._transport and
	720	self._name == other._name and
	721	self._size == other._size)
	722
	723	def __ne__(self, other):
	724	return not self.__eq__(other)
	725
3763.8.12 by John Arbash Meinel Code cleanup.	726	def _get_and_cache_nodes(self, nodes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	727	"""Read nodes and cache them in the lru.
	728
	729	The nodes list supplied is sorted and then read from disk, each node
	730	being inserted it into the _node_cache.
	731
	732	Note: Asking for more nodes than the _node_cache can contain will
	733	result in some of the results being immediately discarded, to prevent
	734	this an assertion is raised if more nodes are asked for than are
	735	cachable.
	736
	737	:return: A dict of {node_pos: node}
	738	"""
	739	found = {}
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	740	start_of_leaves = None
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	741	for node_pos, node in self._read_nodes(sorted(nodes)):
	742	if node_pos == 0: # Special case
	743	self._root_node = node
	744	else:
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	745	if start_of_leaves is None:
	746	start_of_leaves = self._row_offsets[-2]
	747	if node_pos < start_of_leaves:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	748	self._internal_node_cache[node_pos] = node
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	749	else:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	750	self._leaf_node_cache[node_pos] = node
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	751	found[node_pos] = node
	752	return found
	753
def _compute_recommended_pages(self):
    """Convert transport's recommended_page_size into btree pages.

    recommended_page_size is in bytes, we want to know how many _PAGE_SIZE
    pages fit in that length.

    :return: The page count, rounded up to a whole number of pages.
    """
    byte_count = self._transport.recommended_page_size()
    return int(math.ceil(byte_count / float(_PAGE_SIZE)))
	764
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	765	def _compute_total_pages_in_index(self):
	766	"""How many pages are in the index.
	767
	768	If we have read the header we will use the value stored there.
	769	Otherwise it will be computed based on the length of the index.
	770	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	771	if self._size is None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	772	raise AssertionError('_compute_total_pages_in_index should not be'
	773	' called when self._size is None')
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	774	if self._root_node is not None:
	775	# This is the number of pages as defined by the header
	776	return self._row_offsets[-1]
	777	# This is the number of pages as defined by the size of the index. They
	778	# should be indentical.
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	779	total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
	780	return total_pages
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	781
def _expand_offsets(self, offsets):
    """Find extra pages to download.

    The idea is that we always want to make big-enough requests (like 64kB
    for http), so that we don't waste round trips. So given the entries
    that we already have cached and the new pages being downloaded figure
    out what other pages we might want to read.

    See also doc/developers/btree_index_prefetch.txt for more details.

    :param offsets: The offsets to be read
    :return: A list of offsets to download
    """
    def note(fmt, *args):
        # Only report when the 'index' debug flag is switched on.
        if 'index' in debug.debug_flags:
            trace.mutter(fmt, *args)

    note('expanding: %s\toffsets: %s', self._name, offsets)

    request_size = len(offsets)
    if request_size >= self._recommended_pages:
        # Don't add more, we are already requesting more than enough
        note(' not expanding large request (%s >= %s)',
             request_size, self._recommended_pages)
        return offsets
    if self._size is None:
        # Don't try anything, because we don't know where the file ends
        note(' not expanding without knowing index size')
        return offsets
    total_pages = self._compute_total_pages_in_index()
    cached_offsets = self._get_offsets_to_cached_pages()
    if total_pages - len(cached_offsets) <= self._recommended_pages:
        # If reading recommended_pages would read the rest of the index,
        # just do so: read whatever is left.
        unread = [page for page in xrange(total_pages)
                  if page not in cached_offsets]
        note(' reading all unread pages: %s', unread)
        return unread

    if self._root_node is not None:
        tree_depth = len(self._row_lengths)
        if request_size == 1 and len(cached_offsets) < tree_depth:
            # We haven't read enough to justify expansion
            # If we are only going to read the root node, and 1 leaf node,
            # then it isn't worth expanding our request. Once we've read at
            # least 2 nodes, then we are probably doing a search, and we
            # start expanding our requests.
            note(' not expanding on first reads')
            return offsets
        wanted = self._expand_to_neighbors(offsets, cached_offsets,
                                           total_pages)
    else:
        # ATM on the first read of the root node of a large index, we don't
        # bother pre-reading any other pages. This is because the
        # likelihood of actually reading interesting pages is very low.
        # See doc/developers/btree_index_prefetch.txt for a discussion, and
        # a possible implementation when we are guessing that the second
        # layer index is small
        wanted = offsets

    final_offsets = sorted(wanted)
    note('expanded: %s', final_offsets)
    return final_offsets
	850
	851	def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
	852	"""Expand requests to neighbors until we have enough pages.
	853
	854	This is called from _expand_offsets after policy has determined that we
	855	want to expand.
	856	We only want to expand requests within a given layer. We cheat a little
	857	bit and assume all requests will be in the same layer. This is true
	858	given the current design, but if it changes this algorithm may perform
	859	oddly.
	860
	861	:param offsets: requested offsets
	862	:param cached_offsets: offsets for pages we currently have cached
	863	:return: A set() of offsets after expansion
	864	"""
	865	final_offsets = set(offsets)
	866	first = end = None
	867	new_tips = set(final_offsets)
	868	while len(final_offsets) < self._recommended_pages and new_tips:
	869	next_tips = set()
	870	for pos in new_tips:
	871	if first is None:
	872	first, end = self._find_layer_first_and_end(pos)
	873	previous = pos - 1
	874	if (previous > 0
	875	and previous not in cached_offsets
	876	and previous not in final_offsets
	877	and previous >= first):
	878	next_tips.add(previous)
	879	after = pos + 1
	880	if (after < total_pages
	881	and after not in cached_offsets
	882	and after not in final_offsets
	883	and after < end):
	884	next_tips.add(after)
	885	# This would keep us from going bigger than
	886	# recommended_pages by only expanding the first offsets.
	887	# However, if we are making a 'wide' request, it is
	888	# reasonable to expand all points equally.
	889	# if len(final_offsets) > recommended_pages:
	890	# break
	891	final_offsets.update(next_tips)
	892	new_tips = next_tips
	893	return final_offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	894
def clear_cache(self):
    """Clear out any cached/memoized values.

    This can be called at any time, but generally it is used when we have
    extracted some information, but don't expect to be requesting any more
    from this index.
    """
    # Only the leaf cache is emptied. self._root_node and
    # self._internal_node_cache are left alone: we don't expect either of
    # those to be big, and keeping them can save round-trips in the future.
    # We may re-evaluate this if InternalNode memory starts to be an issue.
    leaf_cache = self._leaf_node_cache
    leaf_cache.clear()
	907
def external_references(self, ref_list_num):
    """Return the references that are not keys of this index.

    The root node is fetched first if it has not been read yet.

    :param ref_list_num: Which of the reference lists of every entry to
        examine.
    :return: A set of the keys referenced in that list by some entry, less
        the keys that are present in the index.
    :raises ValueError: if the index has no reference list with that
        number.
    """
    if self._root_node is None:
        self._get_root_node()
    if ref_list_num + 1 > self.node_ref_lists:
        raise ValueError('No ref list %d, index has %d ref lists'
                         % (ref_list_num, self.node_ref_lists))
    present = set()
    referenced = set()
    for entry in self.iter_all_entries():
        present.add(entry[1])
        referenced.update(entry[3][ref_list_num])
    return referenced.difference(present)
	920
3763.8.12 by John Arbash Meinel Code cleanup.	921	def _find_layer_first_and_end(self, offset):
	922	"""Find the start/stop nodes for the layer corresponding to offset.
	923
	924	:return: (first, end)
	925	first is the first node in this layer
	926	end is the first node of the next layer
	927	"""
	928	first = end = 0
	929	for roffset in self._row_offsets:
	930	first = end
	931	end = roffset
	932	if offset < roffset:
	933	break
	934	return first, end
	935
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	936	def _get_offsets_to_cached_pages(self):
3763.8.12 by John Arbash Meinel Code cleanup.	937	"""Determine what nodes we already have cached."""
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	938	cached_offsets = set(self._internal_node_cache.keys())
	939	cached_offsets.update(self._leaf_node_cache.keys())
3763.8.12 by John Arbash Meinel Code cleanup.	940	if self._root_node is not None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	941	cached_offsets.add(0)
	942	return cached_offsets
3763.8.12 by John Arbash Meinel Code cleanup.	943
	944	def _get_root_node(self):
	945	if self._root_node is None:
	946	# We may not have a root node yet
	947	self._get_internal_nodes([0])
	948	return self._root_node
	949
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	950	def _get_nodes(self, cache, node_indexes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	951	found = {}
	952	needed = []
	953	for idx in node_indexes:
	954	if idx == 0 and self._root_node is not None:
	955	found[0] = self._root_node
	956	continue
	957	try:
	958	found[idx] = cache[idx]
	959	except KeyError:
	960	needed.append(idx)
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	961	if not needed:
	962	return found
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	963	needed = self._expand_offsets(needed)
3763.8.12 by John Arbash Meinel Code cleanup.	964	found.update(self._get_and_cache_nodes(needed))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	965	return found
	966
	967	def _get_internal_nodes(self, node_indexes):
	968	"""Get a node, from cache or disk.
	969
	970	After getting it, the node will be cached.
	971	"""
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	972	return self._get_nodes(self._internal_node_cache, node_indexes)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	973
3805.4.6 by John Arbash Meinel refactor for clarity.	974	def _cache_leaf_values(self, nodes):
3805.4.6 by John Arbash Meinel refactor for clarity.	975	"""Cache directly from key => value, skipping the btree."""
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	976	if self._leaf_value_cache is not None:
3805.4.6 by John Arbash Meinel refactor for clarity.	977	for node in nodes.itervalues():
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	978	for key, value in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	979	if key in self._leaf_value_cache:
	980	# Don't add the rest of the keys, we've seen this node
	981	# before.
	982	break
	983	self._leaf_value_cache[key] = value
3805.4.6 by John Arbash Meinel refactor for clarity.	984
	985	def _get_leaf_nodes(self, node_indexes):
	986	"""Get a bunch of nodes, from cache or disk."""
	987	found = self._get_nodes(self._leaf_node_cache, node_indexes)
	988	self._cache_leaf_values(found)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	989	return found
	990
	991	def iter_all_entries(self):
	992	"""Iterate over all keys within the index.
	993
	994	:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
	995	The former tuple is used when there are no reference lists in the
	996	index, making the API compatible with simple key:value index types.
	997	There is no defined order for the result iteration - it will be in
	998	the most efficient order for the index.
	999	"""
	1000	if 'evil' in debug.debug_flags:
	1001	trace.mutter_callsite(3,
	1002	"iter_all_entries scales with size of history.")
	1003	if not self.key_count():
	1004	return
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1005	if self._row_offsets[-1] == 1:
	1006	# There is only the root node, and we read that via key_count()
	1007	if self.node_ref_lists:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1008	for key, (value, refs) in self._root_node.all_items():
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1009	yield (self, key, value, refs)
	1010	else:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1011	for key, (value, refs) in self._root_node.all_items():
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1012	yield (self, key, value)
	1013	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1014	start_of_leaves = self._row_offsets[-2]
	1015	end_of_leaves = self._row_offsets[-1]
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	1016	needed_offsets = range(start_of_leaves, end_of_leaves)
	1017	if needed_offsets == [0]:
	1018	# Special case when we only have a root node, as we have already
	1019	# read everything
	1020	nodes = [(0, self._root_node)]
	1021	else:
	1022	nodes = self._read_nodes(needed_offsets)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1023	# We iterate strictly in-order so that we can use this function
	1024	# for spilling index builds to disk.
	1025	if self.node_ref_lists:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	1026	for _, node in nodes:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1027	for key, (value, refs) in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1028	yield (self, key, value, refs)
	1029	else:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	1030	for _, node in nodes:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1031	for key, (value, refs) in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1032	yield (self, key, value)
	1033
	1034	@staticmethod
	1035	def _multi_bisect_right(in_keys, fixed_keys):
	1036	"""Find the positions where each 'in_key' would fit in fixed_keys.
	1037
	1038	This is equivalent to doing "bisect_right" on each in_key into
	1039	fixed_keys
	1040
	1041	:param in_keys: A sorted list of keys to match with fixed_keys
	1042	:param fixed_keys: A sorted list of keys to match against
	1043	:return: A list of (integer position, [key list]) tuples.
	1044	"""
	1045	if not in_keys:
	1046	return []
	1047	if not fixed_keys:
	1048	# no pointers in the fixed_keys list, which means everything must
	1049	# fall to the left.
	1050	return [(0, in_keys)]
	1051
	1052	# TODO: Iterating both lists will generally take M + N steps
	1053	# Bisecting each key will generally take M * log2 N steps.
	1054	# If we had an efficient way to compare, we could pick the method
	1055	# based on which has the fewer number of steps.
	1056	# There is also the argument that bisect_right is a compiled
	1057	# function, so there is even more to be gained.
	1058	# iter_steps = len(in_keys) + len(fixed_keys)
	1059	# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
	1060	if len(in_keys) == 1: # Bisect will always be faster for M = 1
5753.2.4 by Jelmer Vernooij Review feedback from John.	1061	return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1062	# elif bisect_steps < iter_steps:
	1063	# offsets = {}
	1064	# for key in in_keys:
	1065	# offsets.setdefault(bisect_right(fixed_keys, key),
	1066	# []).append(key)
	1067	# return [(o, offsets[o]) for o in sorted(offsets)]
	1068	in_keys_iter = iter(in_keys)
	1069	fixed_keys_iter = enumerate(fixed_keys)
	1070	cur_in_key = in_keys_iter.next()
	1071	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
	1072
	1073	class InputDone(Exception): pass
	1074	class FixedDone(Exception): pass
	1075
	1076	output = []
	1077	cur_out = []
	1078
	1079	# TODO: Another possibility is that rather than iterating on each side,
	1080	# we could use a combination of bisecting and iterating. For
	1081	# example, while cur_in_key < fixed_key, bisect to find its
	1082	# point, then iterate all matching keys, then bisect (restricted
	1083	# to only the remainder) for the next one, etc.
	1084	try:
	1085	while True:
	1086	if cur_in_key < cur_fixed_key:
	1087	cur_keys = []
	1088	cur_out = (cur_fixed_offset, cur_keys)
	1089	output.append(cur_out)
	1090	while cur_in_key < cur_fixed_key:
	1091	cur_keys.append(cur_in_key)
	1092	try:
	1093	cur_in_key = in_keys_iter.next()
	1094	except StopIteration:
	1095	raise InputDone
	1096	# At this point cur_in_key must be >= cur_fixed_key
	1097	# step the cur_fixed_key until we pass the cur key, or walk off
	1098	# the end
	1099	while cur_in_key >= cur_fixed_key:
	1100	try:
	1101	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
	1102	except StopIteration:
	1103	raise FixedDone
	1104	except InputDone:
	1105	# We consumed all of the input, nothing more to do
	1106	pass
	1107	except FixedDone:
	1108	# There was some input left, but we consumed all of fixed, so we
	1109	# have to add one more for the tail
	1110	cur_keys = [cur_in_key]
	1111	cur_keys.extend(in_keys_iter)
	1112	cur_out = (len(fixed_keys), cur_keys)
	1113	output.append(cur_out)
	1114	return output
	1115
4593.4.5 by John Arbash Meinel Start adding some tests.	1116	def _walk_through_internal_nodes(self, keys):
	1117	"""Take the given set of keys, and find the corresponding LeafNodes.
	1118
	1119	:param keys: An unsorted iterable of keys to search for
	1120	:return: (nodes, index_and_keys)
	1121	nodes is a dict mapping {index: LeafNode}
	1122	keys_at_index is a list of tuples of [(index, [keys for Leaf])]
	1123	"""
	1124	# 6 seconds spent in miss_torture using the sorted() line.
	1125	# Even with out of order disk IO it seems faster not to sort it when
	1126	# large queries are being made.
	1127	keys_at_index = [(0, sorted(keys))]
	1128
	1129	for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
	1130	node_indexes = [idx for idx, s_keys in keys_at_index]
	1131	nodes = self._get_internal_nodes(node_indexes)
	1132
	1133	next_nodes_and_keys = []
	1134	for node_index, sub_keys in keys_at_index:
	1135	node = nodes[node_index]
	1136	positions = self._multi_bisect_right(sub_keys, node.keys)
	1137	node_offset = next_row_start + node.offset
	1138	next_nodes_and_keys.extend([(node_offset + pos, s_keys)
	1139	for pos, s_keys in positions])
	1140	keys_at_index = next_nodes_and_keys
	1141	# We should now be at the _LeafNodes
	1142	node_indexes = [idx for idx, s_keys in keys_at_index]
	1143
	1144	# TODO: We may not want to always read all the nodes in one
	1145	# big go. Consider setting a max size on this.
	1146	nodes = self._get_leaf_nodes(node_indexes)
	1147	return nodes, keys_at_index
	1148
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1149	def iter_entries(self, keys):
	1150	"""Iterate over keys within the index.
	1151
	1152	:param keys: An iterable providing the keys to be retrieved.
	1153	:return: An iterable as per iter_all_entries, but restricted to the
	1154	keys supplied. No additional keys will be returned, and every
	1155	key supplied that is in the index will be returned.
	1156	"""
	1157	# 6 seconds spent in miss_torture using the sorted() line.
	1158	# Even with out of order disk IO it seems faster not to sort it when
	1159	# large queries are being made.
	1160	# However, now that we are doing multi-way bisecting, we need the keys
	1161	# in sorted order anyway. We could change the multi-way code to not
	1162	# require sorted order. (For example, it bisects for the first node,
	1163	# does an in-order search until a key comes before the current point,
	1164	# which it then bisects for, etc.)
	1165	keys = frozenset(keys)
	1166	if not keys:
	1167	return
	1168
	1169	if not self.key_count():
	1170	return
	1171
	1172	needed_keys = []
	1173	if self._leaf_value_cache is None:
	1174	needed_keys = keys
	1175	else:
	1176	for key in keys:
	1177	value = self._leaf_value_cache.get(key, None)
	1178	if value is not None:
	1179	# This key is known not to be here, skip it
	1180	value, refs = value
	1181	if self.node_ref_lists:
	1182	yield (self, key, value, refs)
	1183	else:
	1184	yield (self, key, value)
	1185	else:
	1186	needed_keys.append(key)
	1187
	1188	last_key = None
	1189	needed_keys = keys
	1190	if not needed_keys:
	1191	return
4593.4.5 by John Arbash Meinel Start adding some tests.	1192	nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1193	for node_index, sub_keys in nodes_and_keys:
	1194	if not sub_keys:
	1195	continue
	1196	node = nodes[node_index]
	1197	for next_sub_key in sub_keys:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1198	if next_sub_key in node:
	1199	value, refs = node[next_sub_key]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1200	if self.node_ref_lists:
	1201	yield (self, next_sub_key, value, refs)
	1202	else:
	1203	yield (self, next_sub_key, value)
	1204
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1205	def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1206	"""Find the parent_map information for the set of keys.
	1207
	1208	This populates the parent_map dict and missing_keys set based on the
	1209	queried keys. It also can fill out an arbitrary number of parents that
	1210	it finds while searching for the supplied keys.
	1211
	1212	It is unlikely that you want to call this directly. See
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1213	"CombinedGraphIndex.find_ancestry()" for a more appropriate API.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1214
	1215	:param keys: A keys whose ancestry we want to return
	1216	Every key will either end up in 'parent_map' or 'missing_keys'.
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1217	:param ref_list_num: This index in the ref_lists is the parents we
	1218	care about.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1219	:param parent_map: {key: parent_keys} for keys that are present in this
	1220	index. This may contain more entries than were in 'keys', that are
	1221	reachable ancestors of the keys requested.
4593.4.5 by John Arbash Meinel Start adding some tests.	1222	:param missing_keys: keys which are known to be missing in this index.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1223	This may include parents that were not directly requested, but we
	1224	were able to determine that they are not present in this index.
	1225	:return: search_keys parents that were found but not queried to know
	1226	if they are missing or present. Callers can re-query this index for
	1227	those keys, and they will be placed into parent_map or missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1228	"""
	1229	if not self.key_count():
	1230	# We use key_count() to trigger reading the root node and
	1231	# determining info about this BTreeGraphIndex
	1232	# If we don't have any keys, then everything is missing
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1233	missing_keys.update(keys)
	1234	return set()
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1235	if ref_list_num >= self.node_ref_lists:
	1236	raise ValueError('No ref list %d, index has %d ref lists'
	1237	% (ref_list_num, self.node_ref_lists))
	1238
	1239	# The main trick we are trying to accomplish is that when we find a
	1240	# key listing its parents, we expect that the parent key is also likely
	1241	# to sit on the same page. Allowing us to expand parents quickly
	1242	# without suffering the full stack of bisecting, etc.
4593.4.5 by John Arbash Meinel Start adding some tests.	1243	nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1244
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1245	# These are parent keys which could not be immediately resolved on the
	1246	# page where the child was present. Note that we may already be
	1247	# searching for that key, and it may actually be present [or known
	1248	# missing] on one of the other pages we are reading.
	1249	# TODO:
	1250	# We could try searching for them in the immediate previous or next
	1251	# page. If they occur "later" we could put them in a pending lookup
	1252	# set, and then for each node we read thereafter we could check to
	1253	# see if they are present.
	1254	# However, we don't know the impact of keeping this list of things
	1255	# that I'm going to search for every node I come across from here on
	1256	# out.
	1257	# It doesn't handle the case when the parent key is missing on a
	1258	# page that we don't read. So we already have to handle being
	1259	# re-entrant for that.
	1260	# Since most keys contain a date string, they are more likely to be
	1261	# found earlier in the file than later, but we would know that right
	1262	# away (key < min_key), and wouldn't keep searching it on every other
	1263	# page that we read.
	1264	# Mostly, it is an idea, one which should be benchmarked.
	1265	parents_not_on_page = set()
	1266
	1267	for node_index, sub_keys in nodes_and_keys:
	1268	if not sub_keys:
	1269	continue
	1270	# sub_keys is all of the keys we are looking for that should exist
	1271	# on this page, if they aren't here, then they won't be found
	1272	node = nodes[node_index]
	1273	parents_to_check = set()
	1274	for next_sub_key in sub_keys:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1275	if next_sub_key not in node:
4593.4.5 by John Arbash Meinel Start adding some tests.	1276	# This one is just not present in the index at all
	1277	missing_keys.add(next_sub_key)
	1278	else:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1279	value, refs = node[next_sub_key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1280	parent_keys = refs[ref_list_num]
	1281	parent_map[next_sub_key] = parent_keys
	1282	parents_to_check.update(parent_keys)
	1283	# Don't look for things we've already found
	1284	parents_to_check = parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1285	# this can be used to test the benefit of having the check loop
	1286	# inlined.
	1287	# parents_not_on_page.update(parents_to_check)
	1288	# continue
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1289	while parents_to_check:
	1290	next_parents_to_check = set()
	1291	for key in parents_to_check:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1292	if key in node:
	1293	value, refs = node[key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1294	parent_keys = refs[ref_list_num]
	1295	parent_map[key] = parent_keys
	1296	next_parents_to_check.update(parent_keys)
	1297	else:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1298	# This parent either is genuinely missing, or should be
	1299	# found on another page. Perf test whether it is better
	1300	# to check if this node should fit on this page or not.
	1301	# in the 'everything-in-one-pack' scenario, this not
	1302	# doing the check is 237ms vs 243ms.
	1303	# So slightly better, but I assume the standard 'lots
	1304	# of packs' is going to show a reasonable improvement
	1305	# from the check, because it avoids 'going around
	1306	# again' for everything that is in another index
4593.4.5 by John Arbash Meinel Start adding some tests.	1307	# parents_not_on_page.add(key)
	1308	# Missing for some reason
	1309	if key < node.min_key:
	1310	# in the case of bzr.dev, 3.4k/5.3k misses are
	1311	# 'earlier' misses (65%)
	1312	parents_not_on_page.add(key)
	1313	elif key > node.max_key:
	1314	# This parent key would be present on a different
	1315	# LeafNode
	1316	parents_not_on_page.add(key)
	1317	else:
	1318	# assert key != node.min_key and key != node.max_key
	1319	# If it was going to be present, it would be on
	1320	# this page, so mark it missing.
	1321	missing_keys.add(key)
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1322	parents_to_check = next_parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1323	# Might want to do another .difference() from missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1324	# parents_not_on_page could have been found on a different page, or be
	1325	# known to be missing. So cull out everything that has already been
	1326	# found.
4593.4.5 by John Arbash Meinel Start adding some tests.	1327	search_keys = parents_not_on_page.difference(
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1328	parent_map).difference(missing_keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1329	return search_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1330
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1331	def iter_entries_prefix(self, keys):
	1332	"""Iterate over keys within the index using prefix matching.
	1333
	1334	Prefix matching is applied within the tuple of a key, not to within
	1335	the bytestring of each key element. e.g. if you have the keys ('foo',
	1336	'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
	1337	only the former key is returned.
	1338
	1339	WARNING: Note that this method currently causes a full index parse
	1340	unconditionally (which is reasonably appropriate as it is a means for
	1341	thunking many small indices into one larger one and still supplies
	1342	iter_all_entries at the thunk layer).
	1343
	1344	:param keys: An iterable providing the key prefixes to be retrieved.
	1345	Each key prefix takes the form of a tuple the length of a key, but
	1346	with the last N elements 'None' rather than a regular bytestring.
	1347	The first element cannot be 'None'.
	1348	:return: An iterable as per iter_all_entries, but restricted to the
	1349	keys with a matching prefix to those supplied. No additional keys
	1350	will be returned, and every match that is in the index will be
	1351	returned.
	1352	"""
	1353	keys = sorted(set(keys))
	1354	if not keys:
	1355	return
	1356	# Load if needed to check key lengths
	1357	if self._key_count is None:
	1358	self._get_root_node()
	1359	# TODO: only access nodes that can satisfy the prefixes we are looking
	1360	# for. For now, to meet API usage (as this function is not used by
	1361	# current bzrlib) just suck the entire index and iterate in memory.
	1362	nodes = {}
	1363	if self.node_ref_lists:
	1364	if self._key_length == 1:
	1365	for _1, key, value, refs in self.iter_all_entries():
	1366	nodes[key] = value, refs
	1367	else:
	1368	nodes_by_key = {}
	1369	for _1, key, value, refs in self.iter_all_entries():
	1370	key_value = key, value, refs
	1371	# For a key of (foo, bar, baz) create
	1372	# _nodes_by_key[foo][bar][baz] = key_value
	1373	key_dict = nodes_by_key
	1374	for subkey in key[:-1]:
	1375	key_dict = key_dict.setdefault(subkey, {})
	1376	key_dict[key[-1]] = key_value
	1377	else:
	1378	if self._key_length == 1:
	1379	for _1, key, value in self.iter_all_entries():
	1380	nodes[key] = value
	1381	else:
	1382	nodes_by_key = {}
	1383	for _1, key, value in self.iter_all_entries():
	1384	key_value = key, value
	1385	# For a key of (foo, bar, baz) create
	1386	# _nodes_by_key[foo][bar][baz] = key_value
	1387	key_dict = nodes_by_key
	1388	for subkey in key[:-1]:
	1389	key_dict = key_dict.setdefault(subkey, {})
	1390	key_dict[key[-1]] = key_value
	1391	if self._key_length == 1:
	1392	for key in keys:
	1393	# sanity check
	1394	if key[0] is None:
1395	raise errors.BadIndexKey(key)
1396	if len(key) != self._key_length:
1397	raise errors.BadIndexKey(key)
1398	try:
1399	if self.node_ref_lists:
1400	value, node_refs = nodes[key]
1401	yield self, key, value, node_refs
1402	else:
1403	yield self, key, nodes[key]
1404	except KeyError:
1405	pass
1406	return
1407	for key in keys:
1408	# sanity check
1409	if key[0] is None:
1410	raise errors.BadIndexKey(key)
1411	if len(key) != self._key_length:
1412	raise errors.BadIndexKey(key)
1413	# find what it refers to:
1414	key_dict = nodes_by_key
1415	elements = list(key)
1416	# find the subdict whose contents should be returned.
1417	try:
1418	while len(elements) and elements[0] is not None:
1419	key_dict = key_dict[elements[0]]
1420	elements.pop(0)
1421	except KeyError:
1422	# a non-existant lookup.
1423	continue
1424	if len(elements):
1425	dicts = [key_dict]
1426	while dicts:
1427	key_dict = dicts.pop(-1)
1428	# can't be empty or would not exist
1429	item, value = key_dict.iteritems().next()
1430	if type(value) == dict:
1431	# push keys
1432	dicts.extend(key_dict.itervalues())
1433	else:
1434	# yield keys
1435	for value in key_dict.itervalues():
1436	# each value is the key:value:node refs tuple
1437	# ready to yield.
1438	yield (self, ) + value
1439	else:
1440	# the last thing looked up was a terminal element
1441	yield (self, ) + key_dict
1442
1443	def key_count(self):
1444	"""Return an estimate of the number of keys in this index.
1445
1446	For BTreeGraphIndex the estimate is exact as it is contained in the
1447	header.
1448	"""
1449	if self._key_count is None:
1450	self._get_root_node()
1451	return self._key_count
1452
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1453	def _compute_row_offsets(self):
	1454	"""Fill out the _row_offsets attribute based on _row_lengths."""
	1455	offsets = []
	1456	row_offset = 0
	1457	for row in self._row_lengths:
	1458	offsets.append(row_offset)
	1459	row_offset += row
	1460	offsets.append(row_offset)
	1461	self._row_offsets = offsets
	1462
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1463	def _parse_header_from_bytes(self, bytes):
	1464	"""Parse the header from a region of bytes.
	1465
	1466	:param bytes: The data to parse.
	1467	:return: An offset, data tuple such as readv yields, for the unparsed
	1468	data. (which may be of length 0).
	1469	"""
	1470	signature = bytes[0:len(self._signature())]
	1471	if not signature == self._signature():
	1472	raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
	1473	lines = bytes[len(self._signature()):].splitlines()
	1474	options_line = lines[0]
	1475	if not options_line.startswith(_OPTION_NODE_REFS):
	1476	raise errors.BadIndexOptions(self)
	1477	try:
	1478	self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
	1479	except ValueError:
	1480	raise errors.BadIndexOptions(self)
	1481	options_line = lines[1]
	1482	if not options_line.startswith(_OPTION_KEY_ELEMENTS):
	1483	raise errors.BadIndexOptions(self)
	1484	try:
	1485	self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
	1486	except ValueError:
	1487	raise errors.BadIndexOptions(self)
	1488	options_line = lines[2]
	1489	if not options_line.startswith(_OPTION_LEN):
	1490	raise errors.BadIndexOptions(self)
	1491	try:
	1492	self._key_count = int(options_line[len(_OPTION_LEN):])
	1493	except ValueError:
	1494	raise errors.BadIndexOptions(self)
	1495	options_line = lines[3]
	1496	if not options_line.startswith(_OPTION_ROW_LENGTHS):
	1497	raise errors.BadIndexOptions(self)
	1498	try:
	1499	self._row_lengths = map(int, [length for length in
	1500	options_line[len(_OPTION_ROW_LENGTHS):].split(',')
	1501	if len(length)])
	1502	except ValueError:
	1503	raise errors.BadIndexOptions(self)
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1504	self._compute_row_offsets()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1505
	1506	# calculate the bytes we have processed
	1507	header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
	1508	return header_end, bytes[header_end:]
	1509
    def _read_nodes(self, nodes):
        """Read some nodes from disk and yield them parsed.

        This is a generator. It fetches the raw data for the requested nodes
        (from the transport, or from self._file when that is set), then
        decompresses and parses each one and yields it to the caller. The
        nodes are requested in the supplied order. If possible doing sort()
        on the list before requesting a read may improve performance.

        When node 0 is requested and the size of the index is not yet known,
        the whole file is read instead: self._size is set from it, every page
        of the file is parsed and yielded, and the rest of 'nodes' is not
        consulted.

        Reading node 0 also parses the header (see
        _parse_header_from_bytes); if nothing follows the header in that
        page, no node is yielded for it.

        :param nodes: The nodes to read. 0 - first node, 1 - second node etc.
        :return: An iterator of (node index, node) tuples, where node is
            either whatever self._leaf_factory produced or an _InternalNode.
        :raises AssertionError: if a non-root node starts past the known end
            of the file, or a decompressed page starts with neither the leaf
            nor the internal flag.
        """
        # may be the byte string of the whole file
        # (note: the same name is reused further down for the decompressed
        # content of each individual node)
        bytes = None
        # list of (offset, length) regions of the file that should, eventually
        # be read in to data_ranges, either from 'bytes' or from the transport
        ranges = []
        # Offsets in 'ranges' are absolute positions in the file, i.e. they
        # include base_offset.
        base_offset = self._base_offset
        for index in nodes:
            offset = (index * _PAGE_SIZE)
            size = _PAGE_SIZE
            if index == 0:
                # Root node - special case
                if self._size:
                    size = min(_PAGE_SIZE, self._size)
                else:
                    # The only case where we don't know the size, is for very
                    # small indexes. So we read the whole thing
                    bytes = self._transport.get_bytes(self._name)
                    num_bytes = len(bytes)
                    self._size = num_bytes - base_offset
                    # the whole thing should be parsed out of 'bytes'
                    ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                        for start in xrange(base_offset, num_bytes, _PAGE_SIZE)]
                    # 'ranges' now covers every page, so stop looking at the
                    # remaining requested nodes.
                    break
            else:
                if offset > self._size:
                    raise AssertionError('tried to read past the end'
                                         ' of the file %s > %s'
                                         % (offset, self._size))
                # The last page of the file may be shorter than a full page.
                size = min(size, self._size - offset)
            ranges.append((base_offset + offset, size))
        if not ranges:
            return
        elif bytes is not None:
            # already have the whole file
            data_ranges = [(start, bytes[start:start+size])
                           for start, size in ranges]
        elif self._file is None:
            data_ranges = self._transport.readv(self._name, ranges)
        else:
            # A file object is available: seek and read each region from it.
            data_ranges = []
            for offset, size in ranges:
                self._file.seek(offset)
                data_ranges.append((offset, self._file.read(size)))
        for offset, data in data_ranges:
            # Convert back from an absolute file position to an offset
            # within the index.
            offset -= base_offset
            if offset == 0:
                # extract the header
                # After this 'offset' is the end of the header rather than 0.
                offset, data = self._parse_header_from_bytes(data)
                if len(data) == 0:
                    continue
            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = self._leaf_factory(bytes, self._key_length,
                    self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
            else:
                raise AssertionError("Unknown node type for %r" % bytes)
            # NOTE(review): relies on Python 2 integer division to map the
            # offset back to a node index (presumably the header always ends
            # within the first page, giving 0 for the root) - confirm before
            # porting to Python 3, where '/' yields a float.
            yield offset / _PAGE_SIZE, node
	1580
    def _signature(self):
        """The file signature for this index type.

        _parse_header_from_bytes compares the start of the data against this
        value and rejects the file when they differ.
        """
        return _BTSIGNATURE
	1584
	1585	def validate(self):
	1586	"""Validate that everything in the index can be accessed."""
	1587	# just read and parse every node.
	1588	self._get_root_node()
	1589	if len(self._row_lengths) > 1:
	1590	start_node = self._row_offsets[1]
	1591	else:
	1592	# We shouldn't be reading anything anyway
	1593	start_node = 1
	1594	node_end = self._row_offsets[-1]
	1595	for node in self._read_nodes(range(start_node, node_end)):
	1596	pass
	1597
	1598
# Default leaf parser for chk indexes: the pure-Python _LeafNode. It is
# replaced below when the compiled serializer can be imported.
_gcchk_factory = _LeafNode

try:
    # Prefer the compiled (pyrex) serializer, which also supplies a
    # specialised parser for chk leaf nodes.
    from bzrlib import _btree_serializer_pyx as _btree_serializer
    _gcchk_factory = _btree_serializer._parse_into_chk
except ImportError, e:
    # The extension is unavailable: report the failure through osutils and
    # fall back to the pure-Python serializer, keeping _gcchk_factory at its
    # default.
    osutils.failed_to_load_extension(e)
    from bzrlib import _btree_serializer_py as _btree_serializer