~bzr-pqm/bzr/bzr.dev : contents of bzrlib/btree

~bzr-pqm/bzr/bzr.dev : (revision 6478)

5752.3.8 by John Arbash Meinel Merge bzr.dev 5764 to resolve release-notes (aka NEWS) conflicts	1	# Copyright (C) 2008-2011 Canonical Ltd
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	2	#
	3	# This program is free software; you can redistribute it and/or modify
3641.3.29 by John Arbash Meinel Cleanup the copyright headers	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob update FSF mailing address	15	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	16	#
	17
6379.6.7 by Jelmer Vernooij Move importing from future until after doc string, otherwise the doc string will disappear.	18	"""B+Tree indices"""
	19
6379.6.1 by Jelmer Vernooij Import absolute_import in a few places.	20	from __future__ import absolute_import
	21
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	22	import cStringIO
5753.2.2 by Jelmer Vernooij Remove some unnecessary imports, clean up lazy imports.	23
	24	from bzrlib.lazy_import import lazy_import
	25	lazy_import(globals(), """
5753.2.4 by Jelmer Vernooij Review feedback from John.	26	import bisect
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	27	import math
	28	import tempfile
	29	import zlib
5753.2.2 by Jelmer Vernooij Remove some unnecessary imports, clean up lazy imports.	30	""")
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	31
	32	from bzrlib import (
	33	chunk_writer,
	34	debug,
	35	errors,
4208.1.2 by John Arbash Meinel Switch to using a FIFOCache.	36	fifo_cache,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	37	index,
	38	lru_cache,
	39	osutils,
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	40	static_tuple,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	41	trace,
5273.1.7 by Vincent Ladeuil No more use of the get_transport imported symbol, all uses are through	42	transport,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	43	)
	44	from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
	45
	46
3641.3.3 by John Arbash Meinel Change the header to indicate these indexes are	47	_BTSIGNATURE = "B+Tree Graph Index 2\n"
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	48	_OPTION_ROW_LENGTHS = "row_lengths="
	49	_LEAF_FLAG = "type=leaf\n"
	50	_INTERNAL_FLAG = "type=internal\n"
	51	_INTERNAL_OFFSET = "offset="
	52
	53	_RESERVED_HEADER_BYTES = 120
	54	_PAGE_SIZE = 4096
	55
	56	# 4K per page: 4MB - 1000 entries
	57	_NODE_CACHE_SIZE = 1000
	58
	59
	60	class _BuilderRow(object):
	61	"""The stored state accumulated while writing out a row in the index.
	62
	63	:ivar spool: A temporary file used to accumulate nodes for this row
	64	in the tree.
	65	:ivar nodes: The count of nodes emitted so far.
	66	"""
	67
	68	def __init__(self):
	69	"""Create a _BuilderRow."""
	70	self.nodes = 0
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	71	self.spool = None# tempfile.TemporaryFile(prefix='bzr-index-row-')
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	72	self.writer = None
	73
	74	def finish_node(self, pad=True):
	75	byte_lines, _, padding = self.writer.finish()
	76	if self.nodes == 0:
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	77	self.spool = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	78	# padded note:
	79	self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	80	elif self.nodes == 1:
	81	# We got bigger than 1 node, switch to a temp file
	82	spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
	83	spool.write(self.spool.getvalue())
	84	self.spool = spool
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	85	skipped_bytes = 0
	86	if not pad and padding:
	87	del byte_lines[-1]
	88	skipped_bytes = padding
	89	self.spool.writelines(byte_lines)
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	90	remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
	91	if remainder != 0:
	92	raise AssertionError("incorrect node length: %d, %d"
	93	% (self.spool.tell(), remainder))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	94	self.nodes += 1
	95	self.writer = None
	96
	97
	98	class _InternalBuilderRow(_BuilderRow):
	99	"""The stored state accumulated while writing out internal rows."""
	100
	101	def finish_node(self, pad=True):
	102	if not pad:
	103	raise AssertionError("Must pad internal nodes only.")
	104	_BuilderRow.finish_node(self)
	105
	106
	107	class _LeafBuilderRow(_BuilderRow):
	108	"""The stored state accumulated while writing out a leaf rows."""
	109
	110
	111	class BTreeBuilder(index.GraphIndexBuilder):
	112	"""A Builder for B+Tree based Graph indices.
	113
	114	The resulting graph has the structure:
	115
	116	_SIGNATURE OPTIONS NODES
	117	_SIGNATURE := 'B+Tree Graph Index 1' NEWLINE
	118	OPTIONS := REF_LISTS KEY_ELEMENTS LENGTH
	119	REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
	120	KEY_ELEMENTS := 'key_elements=' DIGITS NEWLINE
	121	LENGTH := 'len=' DIGITS NEWLINE
	122	ROW_LENGTHS := 'row_lengths' DIGITS (COMMA DIGITS)*
	123	NODES := NODE_COMPRESSED*
	124	NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
	125	NODE_RAW := INTERNAL \| LEAF
	126	INTERNAL := INTERNAL_FLAG POINTERS
	127	LEAF := LEAF_FLAG ROWS
	128	KEY_ELEMENT := Not-whitespace-utf8
	129	KEY := KEY_ELEMENT (NULL KEY_ELEMENT)*
	130	ROWS := ROW*
	131	ROW := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
	132	ABSENT := 'a'
	133	REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
	134	REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
	135	REFERENCE := KEY
	136	VALUE := no-newline-no-null-bytes
	137	"""
	138
	139	def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
	140	"""See GraphIndexBuilder.__init__.
	141
	142	:param spill_at: Optional parameter controlling the maximum number
	143	of nodes that BTreeBuilder will hold in memory.
	144	"""
	145	index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
	146	key_elements=key_elements)
	147	self._spill_at = spill_at
	148	self._backing_indices = []
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	149	# A map of {key: (node_refs, value)}
	150	self._nodes = {}
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	151	# Indicate it hasn't been built yet
	152	self._nodes_by_key = None
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	153	self._optimize_for_size = False
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	154
	155	def add_node(self, key, value, references=()):
	156	"""Add a node to the index.
	157
	158	If adding the node causes the builder to reach its spill_at threshold,
	159	disk spilling will be triggered.
	160
	161	:param key: The key. keys are non-empty tuples containing
	162	as many whitespace-free utf8 bytestrings as the key length
	163	defined for this index.
	164	:param references: An iterable of iterables of keys. Each is a
	165	reference to another key.
	166	:param value: The value to associate with the key. It may be any
5891.1.3 by Andrew Bennetts Move docstring formatting fixes.	167	bytes as long as it does not contain \\0 or \\n.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	168	"""
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	169	# Ensure that 'key' is a StaticTuple
	170	key = static_tuple.StaticTuple.from_sequence(key).intern()
3644.2.9 by John Arbash Meinel Refactor some code.	171	# we don't care about absent_references
3644.2.9 by John Arbash Meinel Refactor some code.	172	node_refs, _ = self._check_key_ref_value(key, references, value)
3644.2.2 by John Arbash Meinel the new btree index doesn't have 'absent' keys in its _nodes	173	if key in self._nodes:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	174	raise errors.BadIndexDuplicateKey(key, self)
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	175	self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
3644.2.9 by John Arbash Meinel Refactor some code.	176	if self._nodes_by_key is not None and self._key_length > 1:
3644.2.9 by John Arbash Meinel Refactor some code.	177	self._update_nodes_by_key(key, value, node_refs)
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	178	if len(self._nodes) < self._spill_at:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	179	return
3644.2.9 by John Arbash Meinel Refactor some code.	180	self._spill_mem_keys_to_disk()
	181
	182	def _spill_mem_keys_to_disk(self):
	183	"""Write the in memory keys down to disk to cap memory consumption.
	184
	185	If we already have some keys written to disk, we will combine them so
	186	as to preserve the sorted order. The algorithm for combining uses
	187	powers of two. So on the first spill, write all mem nodes into a
	188	single index. On the second spill, combine the mem nodes with the nodes
	189	on disk to create a 2x sized disk index and get rid of the first index.
	190	On the third spill, create a single new disk index, which will contain
	191	the mem nodes, and preserve the existing 2x sized index. On the fourth,
	192	combine mem with the first and second indexes, creating a new one of
	193	size 4x. On the fifth create a single new one, etc.
	194	"""
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	195	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	196	(new_backing_file, size,
	197	backing_pos) = self._spill_mem_keys_and_combine()
	198	else:
	199	new_backing_file, size = self._spill_mem_keys_without_combining()
	200	# Note: The transport here isn't strictly needed, because we will use
	201	# direct access to the new_backing._file object
6083.1.1 by Jelmer Vernooij Use get_transport_from_{url,path} in more places.	202	new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
5273.1.7 by Vincent Ladeuil No more use of the get_transport imported symbol, all uses are through	203	'<temp>', size)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	204	# GC will clean up the file
	205	new_backing._file = new_backing_file
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	206	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	207	if len(self._backing_indices) == backing_pos:
	208	self._backing_indices.append(None)
	209	self._backing_indices[backing_pos] = new_backing
	210	for backing_pos in range(backing_pos):
	211	self._backing_indices[backing_pos] = None
	212	else:
	213	self._backing_indices.append(new_backing)
	214	self._nodes = {}
	215	self._nodes_by_key = None
	216
	217	def _spill_mem_keys_without_combining(self):
	218	return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
	219
	220	def _spill_mem_keys_and_combine(self):
4168.3.4 by John Arbash Meinel Restore the ability to spill, but prepare a flag to disable it.	221	iterators_to_combine = [self._iter_mem_nodes()]
	222	pos = -1
	223	for pos, backing in enumerate(self._backing_indices):
	224	if backing is None:
	225	pos -= 1
	226	break
	227	iterators_to_combine.append(backing.iter_all_entries())
	228	backing_pos = pos + 1
	229	new_backing_file, size = \
	230	self._write_nodes(self._iter_smallest(iterators_to_combine),
	231	allow_optimize=False)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	232	return new_backing_file, size, backing_pos
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	233
	234	def add_nodes(self, nodes):
	235	"""Add nodes to the index.
	236
	237	:param nodes: An iterable of (key, node_refs, value) entries to add.
	238	"""
	239	if self.reference_lists:
	240	for (key, value, node_refs) in nodes:
	241	self.add_node(key, value, node_refs)
	242	else:
	243	for (key, value) in nodes:
	244	self.add_node(key, value)
	245
	246	def _iter_mem_nodes(self):
	247	"""Iterate over the nodes held in memory."""
3644.2.8 by John Arbash Meinel Two quick tweaks.	248	nodes = self._nodes
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	249	if self.reference_lists:
3644.2.8 by John Arbash Meinel Two quick tweaks.	250	for key in sorted(nodes):
	251	references, value = nodes[key]
	252	yield self, key, value, references
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	253	else:
3644.2.8 by John Arbash Meinel Two quick tweaks.	254	for key in sorted(nodes):
	255	references, value = nodes[key]
	256	yield self, key, value
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	257
	258	def _iter_smallest(self, iterators_to_combine):
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	259	if len(iterators_to_combine) == 1:
	260	for value in iterators_to_combine[0]:
	261	yield value
	262	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	263	current_values = []
	264	for iterator in iterators_to_combine:
	265	try:
	266	current_values.append(iterator.next())
	267	except StopIteration:
	268	current_values.append(None)
	269	last = None
	270	while True:
	271	# Decorate candidates with the value to allow 2.4's min to be used.
	272	candidates = [(item[1][1], item) for item
	273	in enumerate(current_values) if item[1] is not None]
	274	if not len(candidates):
	275	return
	276	selected = min(candidates)
	277	# undecorate back to (pos, node)
	278	selected = selected[1]
	279	if last == selected[1][1]:
	280	raise errors.BadIndexDuplicateKey(last, self)
	281	last = selected[1][1]
	282	# Yield, with self as the index
	283	yield (self,) + selected[1][1:]
	284	pos = selected[0]
	285	try:
	286	current_values[pos] = iterators_to_combine[pos].next()
	287	except StopIteration:
	288	current_values[pos] = None
	289
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	290	def _add_key(self, string_key, line, rows, allow_optimize=True):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	291	"""Add a key to the current chunk.
	292
	293	:param string_key: The key to add.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	294	:param line: The fully serialised key and value.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	295	:param allow_optimize: If set to False, prevent setting the optimize
	296	flag when writing out. This is used by the _spill_mem_keys_to_disk
	297	functionality.
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	298	"""
6178.2.9 by Shannon Weyrick A version of the patch, based on suggestions from John Meinel, which detects an empty page differently to avoid false positives.	299	new_leaf = False
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	300	if rows[-1].writer is None:
	301	# opening a new leaf chunk;
6178.2.9 by Shannon Weyrick A version of the patch, based on suggestions from John Meinel, which detects an empty page differently to avoid false positives.	302	new_leaf = True
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	303	for pos, internal_row in enumerate(rows[:-1]):
	304	# flesh out any internal nodes that are needed to
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	305	# preserve the height of the tree
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	306	if internal_row.writer is None:
	307	length = _PAGE_SIZE
	308	if internal_row.nodes == 0:
	309	length -= _RESERVED_HEADER_BYTES # padded
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	310	if allow_optimize:
	311	optimize_for_size = self._optimize_for_size
	312	else:
	313	optimize_for_size = False
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	314	internal_row.writer = chunk_writer.ChunkWriter(length, 0,
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	315	optimize_for_size=optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	316	internal_row.writer.write(_INTERNAL_FLAG)
	317	internal_row.writer.write(_INTERNAL_OFFSET +
	318	str(rows[pos + 1].nodes) + "\n")
	319	# add a new leaf
	320	length = _PAGE_SIZE
	321	if rows[-1].nodes == 0:
	322	length -= _RESERVED_HEADER_BYTES # padded
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	323	rows[-1].writer = chunk_writer.ChunkWriter(length,
	324	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	325	rows[-1].writer.write(_LEAF_FLAG)
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	326	if rows[-1].writer.write(line):
6178.2.9 by Shannon Weyrick A version of the patch, based on suggestions from John Meinel, which detects an empty page differently to avoid false positives.	327	# if we failed to write, despite having an empty page to write to,
	328	# then line is too big. raising the error avoids infinite recursion
	329	# searching for a suitably large page that will not be found.
	330	if new_leaf:
	331	raise errors.BadIndexKey(string_key)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	332	# this key did not fit in the node:
	333	rows[-1].finish_node()
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	334	key_line = string_key + "\n"
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	335	new_row = True
	336	for row in reversed(rows[:-1]):
	337	# Mark the start of the next node in the node above. If it
4031.3.1 by Frank Aspell Fixing various typos	338	# doesn't fit then propagate upwards until we find one that
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	339	# it does fit into.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	340	if row.writer.write(key_line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	341	row.finish_node()
	342	else:
	343	# We've found a node that can handle the pointer.
	344	new_row = False
	345	break
	346	# If we reached the current root without being able to mark the
	347	# division point, then we need a new root:
	348	if new_row:
	349	# We need a new row
	350	if 'index' in debug.debug_flags:
	351	trace.mutter('Inserting new global row.')
	352	new_row = _InternalBuilderRow()
	353	reserved_bytes = 0
	354	rows.insert(0, new_row)
	355	# This will be padded, hence the -100
	356	new_row.writer = chunk_writer.ChunkWriter(
	357	_PAGE_SIZE - _RESERVED_HEADER_BYTES,
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	358	reserved_bytes,
	359	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	360	new_row.writer.write(_INTERNAL_FLAG)
	361	new_row.writer.write(_INTERNAL_OFFSET +
	362	str(rows[1].nodes - 1) + "\n")
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	363	new_row.writer.write(key_line)
6178.2.4 by Shannon Weyrick raise BadIndexKey instead of skipping	364	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	365
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	366	def _write_nodes(self, node_iterator, allow_optimize=True):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	367	"""Write node_iterator out as a B+Tree.
	368
	369	:param node_iterator: An iterator of sorted nodes. Each node should
	370	match the output given by iter_all_entries.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	371	:param allow_optimize: If set to False, prevent setting the optimize
	372	flag when writing out. This is used by the _spill_mem_keys_to_disk
	373	functionality.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	374	:return: A file handle for a temporary file containing a B+Tree for
	375	the nodes.
	376	"""
	377	# The index rows - rows[0] is the root, rows[1] is the layer under it
	378	# etc.
	379	rows = []
	380	# forward sorted by key. In future we may consider topological sorting,
	381	# at the cost of table scans for direct lookup, or a second index for
	382	# direct lookup
	383	key_count = 0
	384	# A stack with the number of nodes of each size. 0 is the root node
	385	# and must always be 1 (if there are any nodes in the tree).
	386	self.row_lengths = []
	387	# Loop over all nodes adding them to the bottom row
	388	# (rows[-1]). When we finish a chunk in a row,
4031.3.1 by Frank Aspell Fixing various typos	389	# propagate the key that didn't fit (comes after the chunk) to the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	390	# row above, transitively.
	391	for node in node_iterator:
	392	if key_count == 0:
	393	# First key triggers the first row
	394	rows.append(_LeafBuilderRow())
6178.2.4 by Shannon Weyrick raise BadIndexKey instead of skipping	395	key_count += 1
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	396	string_key, line = _btree_serializer._flatten_node(node,
	397	self.reference_lists)
6178.2.4 by Shannon Weyrick raise BadIndexKey instead of skipping	398	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	399	for row in reversed(rows):
	400	pad = (type(row) != _LeafBuilderRow)
	401	row.finish_node(pad=pad)
	402	lines = [_BTSIGNATURE]
	403	lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
	404	lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
	405	lines.append(_OPTION_LEN + str(key_count) + '\n')
	406	row_lengths = [row.nodes for row in rows]
	407	lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	408	if row_lengths and row_lengths[-1] > 1:
	409	result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
	410	else:
	411	result = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	412	result.writelines(lines)
	413	position = sum(map(len, lines))
	414	root_row = True
	415	if position > _RESERVED_HEADER_BYTES:
	416	raise AssertionError("Could not fit the header in the"
	417	" reserved space: %d > %d"
	418	% (position, _RESERVED_HEADER_BYTES))
	419	# write the rows out:
	420	for row in rows:
	421	reserved = _RESERVED_HEADER_BYTES # reserved space for first node
	422	row.spool.flush()
	423	row.spool.seek(0)
	424	# copy nodes to the finalised file.
	425	# Special case the first node as it may be prefixed
	426	node = row.spool.read(_PAGE_SIZE)
	427	result.write(node[reserved:])
4771.3.1 by John Arbash Meinel We don't have to pad 'short' records.	428	if len(node) == _PAGE_SIZE:
	429	result.write("\x00" * (reserved - position))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	430	position = 0 # Only the root row actually has an offset
	431	copied_len = osutils.pumpfile(row.spool, result)
	432	if copied_len != (row.nodes - 1) * _PAGE_SIZE:
	433	if type(row) != _LeafBuilderRow:
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	434	raise AssertionError("Incorrect amount of data copied"
	435	" expected: %d, got: %d"
	436	% ((row.nodes - 1) * _PAGE_SIZE,
	437	copied_len))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	438	result.flush()
	439	size = result.tell()
	440	result.seek(0)
	441	return result, size
	442
	443	def finish(self):
	444	"""Finalise the index.
	445
	446	:return: A file handle for a temporary file containing the nodes added
	447	to the index.
	448	"""
	449	return self._write_nodes(self.iter_all_entries())[0]
	450
	451	def iter_all_entries(self):
	452	"""Iterate over all keys within the index
	453
4343.2.2 by John Arbash Meinel Fix an important doc bug about the api of iter_all_entries()	454	:return: An iterable of (index, key, value, reference_lists). There is
	455	no defined order for the result iteration - it will be in the most
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	456	efficient order for the index (in this case dictionary hash order).
	457	"""
	458	if 'evil' in debug.debug_flags:
	459	trace.mutter_callsite(3,
	460	"iter_all_entries scales with size of history.")
	461	# Doing serial rather than ordered would be faster; but this shouldn't
	462	# be getting called routinely anyway.
3644.2.8 by John Arbash Meinel Two quick tweaks.	463	iterators = [self._iter_mem_nodes()]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	464	for backing in self._backing_indices:
	465	if backing is not None:
	466	iterators.append(backing.iter_all_entries())
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	467	if len(iterators) == 1:
	468	return iterators[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	469	return self._iter_smallest(iterators)
	470
	471	def iter_entries(self, keys):
	472	"""Iterate over keys within the index.
	473
	474	:param keys: An iterable providing the keys to be retrieved.
	475	:return: An iterable of (index, key, value, reference_lists). There is no
	476	defined order for the result iteration - it will be in the most
	477	efficient order for the index (keys iteration order in this case).
	478	"""
	479	keys = set(keys)
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	480	# Note: We don't use keys.intersection() here. If you read the C api,
	481	# set.intersection(other) special cases when other is a set and
	482	# will iterate the smaller of the two and lookup in the other.
	483	# It does not do this for any other type (even dict, unlike
	484	# some other set functions.) Since we expect keys is generally <<
	485	# self._nodes, it is faster to iterate over it in a list
	486	# comprehension
	487	nodes = self._nodes
	488	local_keys = [key for key in keys if key in nodes]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	489	if self.reference_lists:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	490	for key in local_keys:
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	491	node = nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	492	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	493	else:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	494	for key in local_keys:
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	495	node = nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	496	yield self, key, node[1]
3847.2.1 by John Arbash Meinel Shortcut BTreeBuilder.iter_entries when there are no backing indices.	497	# Find things that are in backing indices that have not been handled
	498	# yet.
3847.2.3 by John Arbash Meinel Bring back the shortcut	499	if not self._backing_indices:
3847.2.3 by John Arbash Meinel Bring back the shortcut	500	return # We won't find anything there either
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	501	# Remove all of the keys that we found locally
	502	keys.difference_update(local_keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	503	for backing in self._backing_indices:
	504	if backing is None:
	505	continue
	506	if not keys:
	507	return
	508	for node in backing.iter_entries(keys):
	509	keys.remove(node[1])
	510	yield (self,) + node[1:]
	511
	512	def iter_entries_prefix(self, keys):
	513	"""Iterate over keys within the index using prefix matching.
	514
	515	Prefix matching is applied within the tuple of a key, not to within
	516	the bytestring of each key element. e.g. if you have the keys ('foo',
	517	'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
	518	only the former key is returned.
	519
	520	:param keys: An iterable providing the key prefixes to be retrieved.
	521	Each key prefix takes the form of a tuple the length of a key, but
	522	with the last N elements 'None' rather than a regular bytestring.
	523	The first element cannot be 'None'.
	524	:return: An iterable as per iter_all_entries, but restricted to the
	525	keys with a matching prefix to those supplied. No additional keys
	526	will be returned, and every match that is in the index will be
	527	returned.
	528	"""
	529	# XXX: To much duplication with the GraphIndex class; consider finding
	530	# a good place to pull out the actual common logic.
	531	keys = set(keys)
	532	if not keys:
	533	return
	534	for backing in self._backing_indices:
	535	if backing is None:
	536	continue
	537	for node in backing.iter_entries_prefix(keys):
	538	yield (self,) + node[1:]
	539	if self._key_length == 1:
	540	for key in keys:
	541	# sanity check
	542	if key[0] is None:
	543	raise errors.BadIndexKey(key)
	544	if len(key) != self._key_length:
	545	raise errors.BadIndexKey(key)
	546	try:
	547	node = self._nodes[key]
	548	except KeyError:
	549	continue
	550	if self.reference_lists:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	551	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	552	else:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	553	yield self, key, node[1]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	554	return
	555	for key in keys:
	556	# sanity check
	557	if key[0] is None:
	558	raise errors.BadIndexKey(key)
	559	if len(key) != self._key_length:
	560	raise errors.BadIndexKey(key)
	561	# find what it refers to:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	562	key_dict = self._get_nodes_by_key()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	563	elements = list(key)
	564	# find the subdict to return
	565	try:
	566	while len(elements) and elements[0] is not None:
	567	key_dict = key_dict[elements[0]]
	568	elements.pop(0)
	569	except KeyError:
	570	# a non-existant lookup.
	571	continue
	572	if len(elements):
	573	dicts = [key_dict]
	574	while dicts:
	575	key_dict = dicts.pop(-1)
	576	# can't be empty or would not exist
	577	item, value = key_dict.iteritems().next()
	578	if type(value) == dict:
	579	# push keys
	580	dicts.extend(key_dict.itervalues())
	581	else:
	582	# yield keys
	583	for value in key_dict.itervalues():
5088.1.1 by Jelmer Vernooij Force value to a tuple before concatenating with tuple that contains variables	584	yield (self, ) + tuple(value)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	585	else:
	586	yield (self, ) + key_dict
	587
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	588	def _get_nodes_by_key(self):
	589	if self._nodes_by_key is None:
	590	nodes_by_key = {}
	591	if self.reference_lists:
	592	for key, (references, value) in self._nodes.iteritems():
	593	key_dict = nodes_by_key
	594	for subkey in key[:-1]:
	595	key_dict = key_dict.setdefault(subkey, {})
	596	key_dict[key[-1]] = key, value, references
	597	else:
	598	for key, (references, value) in self._nodes.iteritems():
	599	key_dict = nodes_by_key
	600	for subkey in key[:-1]:
	601	key_dict = key_dict.setdefault(subkey, {})
	602	key_dict[key[-1]] = key, value
	603	self._nodes_by_key = nodes_by_key
	604	return self._nodes_by_key
	605
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	606	def key_count(self):
	607	"""Return an estimate of the number of keys in this index.
	608
	609	For InMemoryGraphIndex the estimate is exact.
	610	"""
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	611	return len(self._nodes) + sum(backing.key_count() for backing in
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	612	self._backing_indices if backing is not None)
	613
	614	def validate(self):
	615	"""In memory index's have no known corruption at the moment."""
	616
	617
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	618	class _LeafNode(dict):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	619	"""A leaf node for a serialised B+Tree index."""
	620
5365.5.23 by John Arbash Meinel A __sizeof__ check that ensure we are getting what we are looking for.	621	__slots__ = ('min_key', 'max_key', '_keys')
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	622
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	623	def __init__(self, bytes, key_length, ref_list_length):
	624	"""Parse bytes to create a leaf node object."""
	625	# splitlines mangles the \r delimiters.. don't use it.
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	626	key_list = _btree_serializer._parse_leaf_lines(bytes,
	627	key_length, ref_list_length)
	628	if key_list:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	629	self.min_key = key_list[0][0]
	630	self.max_key = key_list[-1][0]
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	631	else:
	632	self.min_key = self.max_key = None
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	633	super(_LeafNode, self).__init__(key_list)
5365.5.23 by John Arbash Meinel A __sizeof__ check that ensure we are getting what we are looking for.	634	self._keys = dict(self)
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	635
	636	def all_items(self):
	637	"""Return a sorted list of (key, (value, refs)) items"""
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	638	items = self.items()
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	639	items.sort()
	640	return items
	641
	642	def all_keys(self):
	643	"""Return a sorted list of all keys."""
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	644	keys = self.keys()
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	645	keys.sort()
	646	return keys
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	647
	648
	649	class _InternalNode(object):
	650	"""An internal node for a serialised B+Tree index."""
	651
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	652	__slots__ = ('keys', 'offset')
	653
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	654	def __init__(self, bytes):
	655	"""Parse bytes to create an internal node object."""
	656	# splitlines mangles the \r delimiters.. don't use it.
	657	self.keys = self._parse_lines(bytes.split('\n'))
	658
	659	def _parse_lines(self, lines):
	660	nodes = []
	661	self.offset = int(lines[1][7:])
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	662	as_st = static_tuple.StaticTuple.from_sequence
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	663	for line in lines[2:]:
	664	if line == '':
	665	break
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	666	nodes.append(as_st(map(intern, line.split('\0'))).intern())
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	667	return nodes
	668
	669
	670	class BTreeGraphIndex(object):
	671	"""Access to nodes via the standard GraphIndex interface for B+Tree's.
	672
	673	Individual nodes are held in a LRU cache. This holds the root node in
	674	memory except when very large walks are done.
	675	"""
	676
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	677	def __init__(self, transport, name, size, unlimited_cache=False,
	678	offset=0):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	679	"""Create a B+Tree index object on the index name.
	680
	681	:param transport: The transport to read data for the index from.
	682	:param name: The file name of the index on transport.
	683	:param size: Optional size of the index in bytes. This allows
	684	compatibility with the GraphIndex API, as well as ensuring that
	685	the initial read (to read the root node header) can be done
	686	without over-reading even on empty indices, and on small indices
	687	allows single-IO to read the entire index.
4634.71.1 by John Arbash Meinel Work around bug #402623 by allowing BTreeGraphIndex(...,unlimited_cache=True).	688	:param unlimited_cache: If set to True, then instead of using an
	689	LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
	690	cache all leaf nodes.
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	691	:param offset: The start of the btree index data isn't byte 0 of the
	692	file. Instead it starts at some point later.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	693	"""
	694	self._transport = transport
	695	self._name = name
	696	self._size = size
	697	self._file = None
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	698	self._recommended_pages = self._compute_recommended_pages()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	699	self._root_node = None
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	700	self._base_offset = offset
5365.5.18 by John Arbash Meinel Expose the new leaf node factory across the stack.	701	self._leaf_factory = _LeafNode
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	702	# Default max size is 100,000 leave values
	703	self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
4634.71.1 by John Arbash Meinel Work around bug #402623 by allowing BTreeGraphIndex(...,unlimited_cache=True).	704	if unlimited_cache:
	705	self._leaf_node_cache = {}
	706	self._internal_node_cache = {}
	707	else:
	708	self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
	709	# We use a FIFO here just to prevent possible blowout. However, a
	710	# 300k record btree has only 3k leaf nodes, and only 20 internal
	711	# nodes. A value of 100 scales to ~100100100 = 1M records.
	712	self._internal_node_cache = fifo_cache.FIFOCache(100)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	713	self._key_count = None
	714	self._row_lengths = None
	715	self._row_offsets = None # Start of each row, [-1] is the end
	716
	717	def __eq__(self, other):
	718	"""Equal when self and other were created with the same parameters."""
	719	return (
	720	type(self) == type(other) and
	721	self._transport == other._transport and
	722	self._name == other._name and
	723	self._size == other._size)
	724
	725	def __ne__(self, other):
	726	return not self.__eq__(other)
	727
3763.8.12 by John Arbash Meinel Code cleanup.	728	def _get_and_cache_nodes(self, nodes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	729	"""Read nodes and cache them in the lru.
	730
	731	The nodes list supplied is sorted and then read from disk, each node
	732	being inserted it into the _node_cache.
	733
	734	Note: Asking for more nodes than the _node_cache can contain will
	735	result in some of the results being immediately discarded, to prevent
	736	this an assertion is raised if more nodes are asked for than are
	737	cachable.
	738
	739	:return: A dict of {node_pos: node}
	740	"""
	741	found = {}
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	742	start_of_leaves = None
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	743	for node_pos, node in self._read_nodes(sorted(nodes)):
	744	if node_pos == 0: # Special case
	745	self._root_node = node
	746	else:
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	747	if start_of_leaves is None:
	748	start_of_leaves = self._row_offsets[-2]
	749	if node_pos < start_of_leaves:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	750	self._internal_node_cache[node_pos] = node
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	751	else:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	752	self._leaf_node_cache[node_pos] = node
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	753	found[node_pos] = node
	754	return found
	755
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	756	def _compute_recommended_pages(self):
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	757	"""Convert transport's recommended_page_size into btree pages.
	758
	759	recommended_page_size is in bytes, we want to know how many _PAGE_SIZE
	760	pages fit in that length.
	761	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	762	recommended_read = self._transport.recommended_page_size()
	763	recommended_pages = int(math.ceil(recommended_read /
	764	float(_PAGE_SIZE)))
	765	return recommended_pages
	766
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	767	def _compute_total_pages_in_index(self):
	768	"""How many pages are in the index.
	769
	770	If we have read the header we will use the value stored there.
	771	Otherwise it will be computed based on the length of the index.
	772	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	773	if self._size is None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	774	raise AssertionError('_compute_total_pages_in_index should not be'
	775	' called when self._size is None')
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	776	if self._root_node is not None:
	777	# This is the number of pages as defined by the header
	778	return self._row_offsets[-1]
	779	# This is the number of pages as defined by the size of the index. They
	780	# should be indentical.
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	781	total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
	782	return total_pages
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	783
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	784	def _expand_offsets(self, offsets):
	785	"""Find extra pages to download.
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	786
	787	The idea is that we always want to make big-enough requests (like 64kB
	788	for http), so that we don't waste round trips. So given the entries
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	789	that we already have cached and the new pages being downloaded figure
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	790	out what other pages we might want to read.
	791
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	792	See also doc/developers/btree_index_prefetch.txt for more details.
	793
	794	:param offsets: The offsets to be read
	795	:return: A list of offsets to download
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	796	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	797	if 'index' in debug.debug_flags:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	798	trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	799
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	800	if len(offsets) >= self._recommended_pages:
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	801	# Don't add more, we are already requesting more than enough
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	802	if 'index' in debug.debug_flags:
	803	trace.mutter(' not expanding large request (%s >= %s)',
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	804	len(offsets), self._recommended_pages)
	805	return offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	806	if self._size is None:
	807	# Don't try anything, because we don't know where the file ends
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	808	if 'index' in debug.debug_flags:
	809	trace.mutter(' not expanding without knowing index size')
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	810	return offsets
	811	total_pages = self._compute_total_pages_in_index()
	812	cached_offsets = self._get_offsets_to_cached_pages()
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	813	# If reading recommended_pages would read the rest of the index, just
	814	# do so.
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	815	if total_pages - len(cached_offsets) <= self._recommended_pages:
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	816	# Read whatever is left
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	817	if cached_offsets:
	818	expanded = [x for x in xrange(total_pages)
	819	if x not in cached_offsets]
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	820	else:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	821	expanded = range(total_pages)
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	822	if 'index' in debug.debug_flags:
	823	trace.mutter(' reading all unread pages: %s', expanded)
	824	return expanded
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	825
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	826	if self._root_node is None:
	827	# ATM on the first read of the root node of a large index, we don't
	828	# bother pre-reading any other pages. This is because the
	829	# likelyhood of actually reading interesting pages is very low.
	830	# See doc/developers/btree_index_prefetch.txt for a discussion, and
	831	# a possible implementation when we are guessing that the second
	832	# layer index is small
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	833	final_offsets = offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	834	else:
3763.8.14 by John Arbash Meinel Add in a shortcut when we haven't cached much yet.	835	tree_depth = len(self._row_lengths)
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	836	if len(cached_offsets) < tree_depth and len(offsets) == 1:
3763.8.14 by John Arbash Meinel Add in a shortcut when we haven't cached much yet.	837	# We haven't read enough to justify expansion
	838	# If we are only going to read the root node, and 1 leaf node,
	839	# then it isn't worth expanding our request. Once we've read at
	840	# least 2 nodes, then we are probably doing a search, and we
	841	# start expanding our requests.
	842	if 'index' in debug.debug_flags:
	843	trace.mutter(' not expanding on first reads')
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	844	return offsets
	845	final_offsets = self._expand_to_neighbors(offsets, cached_offsets,
	846	total_pages)
	847
	848	final_offsets = sorted(final_offsets)
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	849	if 'index' in debug.debug_flags:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	850	trace.mutter('expanded: %s', final_offsets)
	851	return final_offsets
	852
	853	def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
	854	"""Expand requests to neighbors until we have enough pages.
	855
	856	This is called from _expand_offsets after policy has determined that we
	857	want to expand.
	858	We only want to expand requests within a given layer. We cheat a little
	859	bit and assume all requests will be in the same layer. This is true
	860	given the current design, but if it changes this algorithm may perform
	861	oddly.
	862
	863	:param offsets: requested offsets
	864	:param cached_offsets: offsets for pages we currently have cached
	865	:return: A set() of offsets after expansion
	866	"""
	867	final_offsets = set(offsets)
	868	first = end = None
	869	new_tips = set(final_offsets)
	870	while len(final_offsets) < self._recommended_pages and new_tips:
	871	next_tips = set()
	872	for pos in new_tips:
	873	if first is None:
	874	first, end = self._find_layer_first_and_end(pos)
	875	previous = pos - 1
	876	if (previous > 0
	877	and previous not in cached_offsets
	878	and previous not in final_offsets
	879	and previous >= first):
	880	next_tips.add(previous)
	881	after = pos + 1
	882	if (after < total_pages
	883	and after not in cached_offsets
	884	and after not in final_offsets
	885	and after < end):
	886	next_tips.add(after)
	887	# This would keep us from going bigger than
	888	# recommended_pages by only expanding the first offsets.
	889	# However, if we are making a 'wide' request, it is
	890	# reasonable to expand all points equally.
	891	# if len(final_offsets) > recommended_pages:
	892	# break
	893	final_offsets.update(next_tips)
	894	new_tips = next_tips
	895	return final_offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	896
4744.2.6 by John Arbash Meinel Start exposing an GraphIndex.clear_cache() member.	897	def clear_cache(self):
	898	"""Clear out any cached/memoized values.
	899
	900	This can be called at any time, but generally it is used when we have
	901	extracted some information, but don't expect to be requesting any more
	902	from this index.
	903	"""
	904	# Note that we don't touch self._root_node or self._internal_node_cache
	905	# We don't expect either of those to be big, and it can save
	906	# round-trips in the future. We may re-evaluate this if InternalNode
	907	# memory starts to be an issue.
	908	self._leaf_node_cache.clear()
	909
4011.5.3 by Andrew Bennetts Implement and test external_references on GraphIndex and BTreeGraphIndex.	910	def external_references(self, ref_list_num):
	911	if self._root_node is None:
	912	self._get_root_node()
	913	if ref_list_num + 1 > self.node_ref_lists:
	914	raise ValueError('No ref list %d, index has %d ref lists'
	915	% (ref_list_num, self.node_ref_lists))
	916	keys = set()
	917	refs = set()
	918	for node in self.iter_all_entries():
	919	keys.add(node[1])
	920	refs.update(node[3][ref_list_num])
	921	return refs - keys
	922
3763.8.12 by John Arbash Meinel Code cleanup.	923	def _find_layer_first_and_end(self, offset):
	924	"""Find the start/stop nodes for the layer corresponding to offset.
	925
	926	:return: (first, end)
	927	first is the first node in this layer
	928	end is the first node of the next layer
	929	"""
	930	first = end = 0
	931	for roffset in self._row_offsets:
	932	first = end
	933	end = roffset
	934	if offset < roffset:
	935	break
	936	return first, end
	937
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	938	def _get_offsets_to_cached_pages(self):
3763.8.12 by John Arbash Meinel Code cleanup.	939	"""Determine what nodes we already have cached."""
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	940	cached_offsets = set(self._internal_node_cache.keys())
	941	cached_offsets.update(self._leaf_node_cache.keys())
3763.8.12 by John Arbash Meinel Code cleanup.	942	if self._root_node is not None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	943	cached_offsets.add(0)
	944	return cached_offsets
3763.8.12 by John Arbash Meinel Code cleanup.	945
	946	def _get_root_node(self):
	947	if self._root_node is None:
	948	# We may not have a root node yet
	949	self._get_internal_nodes([0])
	950	return self._root_node
	951
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	952	def _get_nodes(self, cache, node_indexes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	953	found = {}
	954	needed = []
	955	for idx in node_indexes:
	956	if idx == 0 and self._root_node is not None:
	957	found[0] = self._root_node
	958	continue
	959	try:
	960	found[idx] = cache[idx]
	961	except KeyError:
	962	needed.append(idx)
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	963	if not needed:
	964	return found
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	965	needed = self._expand_offsets(needed)
3763.8.12 by John Arbash Meinel Code cleanup.	966	found.update(self._get_and_cache_nodes(needed))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	967	return found
	968
	969	def _get_internal_nodes(self, node_indexes):
	970	"""Get a node, from cache or disk.
	971
	972	After getting it, the node will be cached.
	973	"""
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	974	return self._get_nodes(self._internal_node_cache, node_indexes)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	975
3805.4.6 by John Arbash Meinel refactor for clarity.	976	def _cache_leaf_values(self, nodes):
3805.4.6 by John Arbash Meinel refactor for clarity.	977	"""Cache directly from key => value, skipping the btree."""
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	978	if self._leaf_value_cache is not None:
3805.4.6 by John Arbash Meinel refactor for clarity.	979	for node in nodes.itervalues():
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	980	for key, value in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	981	if key in self._leaf_value_cache:
	982	# Don't add the rest of the keys, we've seen this node
	983	# before.
	984	break
	985	self._leaf_value_cache[key] = value
3805.4.6 by John Arbash Meinel refactor for clarity.	986
	987	def _get_leaf_nodes(self, node_indexes):
	988	"""Get a bunch of nodes, from cache or disk."""
	989	found = self._get_nodes(self._leaf_node_cache, node_indexes)
	990	self._cache_leaf_values(found)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	991	return found
	992
	993	def iter_all_entries(self):
	994	"""Iterate over all keys within the index.
	995
	996	:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
	997	The former tuple is used when there are no reference lists in the
	998	index, making the API compatible with simple key:value index types.
	999	There is no defined order for the result iteration - it will be in
	1000	the most efficient order for the index.
	1001	"""
	1002	if 'evil' in debug.debug_flags:
	1003	trace.mutter_callsite(3,
	1004	"iter_all_entries scales with size of history.")
	1005	if not self.key_count():
	1006	return
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1007	if self._row_offsets[-1] == 1:
	1008	# There is only the root node, and we read that via key_count()
	1009	if self.node_ref_lists:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1010	for key, (value, refs) in self._root_node.all_items():
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1011	yield (self, key, value, refs)
	1012	else:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1013	for key, (value, refs) in self._root_node.all_items():
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1014	yield (self, key, value)
	1015	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1016	start_of_leaves = self._row_offsets[-2]
	1017	end_of_leaves = self._row_offsets[-1]
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	1018	needed_offsets = range(start_of_leaves, end_of_leaves)
	1019	if needed_offsets == [0]:
	1020	# Special case when we only have a root node, as we have already
	1021	# read everything
	1022	nodes = [(0, self._root_node)]
	1023	else:
	1024	nodes = self._read_nodes(needed_offsets)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1025	# We iterate strictly in-order so that we can use this function
	1026	# for spilling index builds to disk.
	1027	if self.node_ref_lists:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	1028	for _, node in nodes:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1029	for key, (value, refs) in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1030	yield (self, key, value, refs)
	1031	else:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	1032	for _, node in nodes:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1033	for key, (value, refs) in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1034	yield (self, key, value)
	1035
	1036	@staticmethod
	1037	def _multi_bisect_right(in_keys, fixed_keys):
	1038	"""Find the positions where each 'in_key' would fit in fixed_keys.
	1039
	1040	This is equivalent to doing "bisect_right" on each in_key into
	1041	fixed_keys
	1042
	1043	:param in_keys: A sorted list of keys to match with fixed_keys
	1044	:param fixed_keys: A sorted list of keys to match against
	1045	:return: A list of (integer position, [key list]) tuples.
	1046	"""
	1047	if not in_keys:
	1048	return []
	1049	if not fixed_keys:
	1050	# no pointers in the fixed_keys list, which means everything must
	1051	# fall to the left.
	1052	return [(0, in_keys)]
	1053
	1054	# TODO: Iterating both lists will generally take M + N steps
	1055	# Bisecting each key will generally take M * log2 N steps.
	1056	# If we had an efficient way to compare, we could pick the method
	1057	# based on which has the fewer number of steps.
	1058	# There is also the argument that bisect_right is a compiled
	1059	# function, so there is even more to be gained.
	1060	# iter_steps = len(in_keys) + len(fixed_keys)
	1061	# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
	1062	if len(in_keys) == 1: # Bisect will always be faster for M = 1
5753.2.4 by Jelmer Vernooij Review feedback from John.	1063	return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1064	# elif bisect_steps < iter_steps:
	1065	# offsets = {}
	1066	# for key in in_keys:
	1067	# offsets.setdefault(bisect_right(fixed_keys, key),
	1068	# []).append(key)
	1069	# return [(o, offsets[o]) for o in sorted(offsets)]
	1070	in_keys_iter = iter(in_keys)
	1071	fixed_keys_iter = enumerate(fixed_keys)
	1072	cur_in_key = in_keys_iter.next()
	1073	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
	1074
	1075	class InputDone(Exception): pass
	1076	class FixedDone(Exception): pass
	1077
	1078	output = []
	1079	cur_out = []
	1080
	1081	# TODO: Another possibility is that rather than iterating on each side,
	1082	# we could use a combination of bisecting and iterating. For
	1083	# example, while cur_in_key < fixed_key, bisect to find its
	1084	# point, then iterate all matching keys, then bisect (restricted
	1085	# to only the remainder) for the next one, etc.
	1086	try:
	1087	while True:
	1088	if cur_in_key < cur_fixed_key:
	1089	cur_keys = []
	1090	cur_out = (cur_fixed_offset, cur_keys)
	1091	output.append(cur_out)
	1092	while cur_in_key < cur_fixed_key:
	1093	cur_keys.append(cur_in_key)
	1094	try:
	1095	cur_in_key = in_keys_iter.next()
	1096	except StopIteration:
	1097	raise InputDone
	1098	# At this point cur_in_key must be >= cur_fixed_key
	1099	# step the cur_fixed_key until we pass the cur key, or walk off
	1100	# the end
	1101	while cur_in_key >= cur_fixed_key:
	1102	try:
	1103	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
	1104	except StopIteration:
	1105	raise FixedDone
	1106	except InputDone:
	1107	# We consumed all of the input, nothing more to do
	1108	pass
	1109	except FixedDone:
	1110	# There was some input left, but we consumed all of fixed, so we
	1111	# have to add one more for the tail
	1112	cur_keys = [cur_in_key]
	1113	cur_keys.extend(in_keys_iter)
	1114	cur_out = (len(fixed_keys), cur_keys)
	1115	output.append(cur_out)
	1116	return output
	1117
4593.4.5 by John Arbash Meinel Start adding some tests.	1118	def _walk_through_internal_nodes(self, keys):
	1119	"""Take the given set of keys, and find the corresponding LeafNodes.
	1120
	1121	:param keys: An unsorted iterable of keys to search for
	1122	:return: (nodes, index_and_keys)
	1123	nodes is a dict mapping {index: LeafNode}
	1124	keys_at_index is a list of tuples of [(index, [keys for Leaf])]
	1125	"""
	1126	# 6 seconds spent in miss_torture using the sorted() line.
	1127	# Even with out of order disk IO it seems faster not to sort it when
	1128	# large queries are being made.
	1129	keys_at_index = [(0, sorted(keys))]
	1130
	1131	for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
	1132	node_indexes = [idx for idx, s_keys in keys_at_index]
	1133	nodes = self._get_internal_nodes(node_indexes)
	1134
	1135	next_nodes_and_keys = []
	1136	for node_index, sub_keys in keys_at_index:
	1137	node = nodes[node_index]
	1138	positions = self._multi_bisect_right(sub_keys, node.keys)
	1139	node_offset = next_row_start + node.offset
	1140	next_nodes_and_keys.extend([(node_offset + pos, s_keys)
	1141	for pos, s_keys in positions])
	1142	keys_at_index = next_nodes_and_keys
	1143	# We should now be at the _LeafNodes
	1144	node_indexes = [idx for idx, s_keys in keys_at_index]
	1145
	1146	# TODO: We may not want to always read all the nodes in one
	1147	# big go. Consider setting a max size on this.
	1148	nodes = self._get_leaf_nodes(node_indexes)
	1149	return nodes, keys_at_index
	1150
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1151	def iter_entries(self, keys):
	1152	"""Iterate over keys within the index.
	1153
	1154	:param keys: An iterable providing the keys to be retrieved.
	1155	:return: An iterable as per iter_all_entries, but restricted to the
	1156	keys supplied. No additional keys will be returned, and every
	1157	key supplied that is in the index will be returned.
	1158	"""
	1159	# 6 seconds spent in miss_torture using the sorted() line.
	1160	# Even with out of order disk IO it seems faster not to sort it when
	1161	# large queries are being made.
	1162	# However, now that we are doing multi-way bisecting, we need the keys
	1163	# in sorted order anyway. We could change the multi-way code to not
	1164	# require sorted order. (For example, it bisects for the first node,
	1165	# does an in-order search until a key comes before the current point,
	1166	# which it then bisects for, etc.)
	1167	keys = frozenset(keys)
	1168	if not keys:
	1169	return
	1170
	1171	if not self.key_count():
	1172	return
	1173
	1174	needed_keys = []
	1175	if self._leaf_value_cache is None:
	1176	needed_keys = keys
	1177	else:
	1178	for key in keys:
	1179	value = self._leaf_value_cache.get(key, None)
	1180	if value is not None:
	1181	# This key is known not to be here, skip it
	1182	value, refs = value
	1183	if self.node_ref_lists:
	1184	yield (self, key, value, refs)
	1185	else:
	1186	yield (self, key, value)
	1187	else:
	1188	needed_keys.append(key)
	1189
	1190	last_key = None
	1191	needed_keys = keys
	1192	if not needed_keys:
	1193	return
4593.4.5 by John Arbash Meinel Start adding some tests.	1194	nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1195	for node_index, sub_keys in nodes_and_keys:
	1196	if not sub_keys:
	1197	continue
	1198	node = nodes[node_index]
	1199	for next_sub_key in sub_keys:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1200	if next_sub_key in node:
	1201	value, refs = node[next_sub_key]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1202	if self.node_ref_lists:
	1203	yield (self, next_sub_key, value, refs)
	1204	else:
	1205	yield (self, next_sub_key, value)
	1206
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1207	def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1208	"""Find the parent_map information for the set of keys.
	1209
	1210	This populates the parent_map dict and missing_keys set based on the
	1211	queried keys. It also can fill out an arbitrary number of parents that
	1212	it finds while searching for the supplied keys.
	1213
	1214	It is unlikely that you want to call this directly. See
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1215	"CombinedGraphIndex.find_ancestry()" for a more appropriate API.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1216
	1217	:param keys: A keys whose ancestry we want to return
	1218	Every key will either end up in 'parent_map' or 'missing_keys'.
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1219	:param ref_list_num: This index in the ref_lists is the parents we
	1220	care about.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1221	:param parent_map: {key: parent_keys} for keys that are present in this
	1222	index. This may contain more entries than were in 'keys', that are
	1223	reachable ancestors of the keys requested.
4593.4.5 by John Arbash Meinel Start adding some tests.	1224	:param missing_keys: keys which are known to be missing in this index.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1225	This may include parents that were not directly requested, but we
	1226	were able to determine that they are not present in this index.
	1227	:return: search_keys parents that were found but not queried to know
	1228	if they are missing or present. Callers can re-query this index for
	1229	those keys, and they will be placed into parent_map or missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1230	"""
	1231	if not self.key_count():
	1232	# We use key_count() to trigger reading the root node and
	1233	# determining info about this BTreeGraphIndex
	1234	# If we don't have any keys, then everything is missing
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1235	missing_keys.update(keys)
	1236	return set()
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1237	if ref_list_num >= self.node_ref_lists:
	1238	raise ValueError('No ref list %d, index has %d ref lists'
	1239	% (ref_list_num, self.node_ref_lists))
	1240
	1241	# The main trick we are trying to accomplish is that when we find a
	1242	# key listing its parents, we expect that the parent key is also likely
	1243	# to sit on the same page. Allowing us to expand parents quickly
	1244	# without suffering the full stack of bisecting, etc.
4593.4.5 by John Arbash Meinel Start adding some tests.	1245	nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1246
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1247	# These are parent keys which could not be immediately resolved on the
	1248	# page where the child was present. Note that we may already be
	1249	# searching for that key, and it may actually be present [or known
	1250	# missing] on one of the other pages we are reading.
	1251	# TODO:
	1252	# We could try searching for them in the immediate previous or next
	1253	# page. If they occur "later" we could put them in a pending lookup
	1254	# set, and then for each node we read thereafter we could check to
	1255	# see if they are present.
	1256	# However, we don't know the impact of keeping this list of things
	1257	# that I'm going to search for every node I come across from here on
	1258	# out.
	1259	# It doesn't handle the case when the parent key is missing on a
	1260	# page that we don't read. So we already have to handle being
	1261	# re-entrant for that.
	1262	# Since most keys contain a date string, they are more likely to be
	1263	# found earlier in the file than later, but we would know that right
	1264	# away (key < min_key), and wouldn't keep searching it on every other
	1265	# page that we read.
	1266	# Mostly, it is an idea, one which should be benchmarked.
	1267	parents_not_on_page = set()
	1268
	1269	for node_index, sub_keys in nodes_and_keys:
	1270	if not sub_keys:
	1271	continue
	1272	# sub_keys is all of the keys we are looking for that should exist
	1273	# on this page, if they aren't here, then they won't be found
	1274	node = nodes[node_index]
	1275	parents_to_check = set()
	1276	for next_sub_key in sub_keys:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1277	if next_sub_key not in node:
4593.4.5 by John Arbash Meinel Start adding some tests.	1278	# This one is just not present in the index at all
	1279	missing_keys.add(next_sub_key)
	1280	else:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1281	value, refs = node[next_sub_key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1282	parent_keys = refs[ref_list_num]
	1283	parent_map[next_sub_key] = parent_keys
	1284	parents_to_check.update(parent_keys)
	1285	# Don't look for things we've already found
	1286	parents_to_check = parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1287	# this can be used to test the benefit of having the check loop
	1288	# inlined.
	1289	# parents_not_on_page.update(parents_to_check)
	1290	# continue
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1291	while parents_to_check:
	1292	next_parents_to_check = set()
	1293	for key in parents_to_check:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1294	if key in node:
	1295	value, refs = node[key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1296	parent_keys = refs[ref_list_num]
	1297	parent_map[key] = parent_keys
	1298	next_parents_to_check.update(parent_keys)
	1299	else:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1300	# This parent either is genuinely missing, or should be
	1301	# found on another page. Perf test whether it is better
	1302	# to check if this node should fit on this page or not.
	1303	# in the 'everything-in-one-pack' scenario, this not
	1304	# doing the check is 237ms vs 243ms.
	1305	# So slightly better, but I assume the standard 'lots
	1306	# of packs' is going to show a reasonable improvement
	1307	# from the check, because it avoids 'going around
	1308	# again' for everything that is in another index
4593.4.5 by John Arbash Meinel Start adding some tests.	1309	# parents_not_on_page.add(key)
	1310	# Missing for some reason
	1311	if key < node.min_key:
	1312	# in the case of bzr.dev, 3.4k/5.3k misses are
	1313	# 'earlier' misses (65%)
	1314	parents_not_on_page.add(key)
	1315	elif key > node.max_key:
	1316	# This parent key would be present on a different
	1317	# LeafNode
	1318	parents_not_on_page.add(key)
	1319	else:
	1320	# assert key != node.min_key and key != node.max_key
	1321	# If it was going to be present, it would be on
	1322	# this page, so mark it missing.
	1323	missing_keys.add(key)
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1324	parents_to_check = next_parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1325	# Might want to do another .difference() from missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1326	# parents_not_on_page could have been found on a different page, or be
	1327	# known to be missing. So cull out everything that has already been
	1328	# found.
4593.4.5 by John Arbash Meinel Start adding some tests.	1329	search_keys = parents_not_on_page.difference(
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1330	parent_map).difference(missing_keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1331	return search_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1332
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1333	def iter_entries_prefix(self, keys):
	1334	"""Iterate over keys within the index using prefix matching.
	1335
	1336	Prefix matching is applied within the tuple of a key, not to within
	1337	the bytestring of each key element. e.g. if you have the keys ('foo',
	1338	'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
	1339	only the former key is returned.
	1340
	1341	WARNING: Note that this method currently causes a full index parse
	1342	unconditionally (which is reasonably appropriate as it is a means for
	1343	thunking many small indices into one larger one and still supplies
	1344	iter_all_entries at the thunk layer).
	1345
	1346	:param keys: An iterable providing the key prefixes to be retrieved.
	1347	Each key prefix takes the form of a tuple the length of a key, but
	1348	with the last N elements 'None' rather than a regular bytestring.
	1349	The first element cannot be 'None'.
	1350	:return: An iterable as per iter_all_entries, but restricted to the
	1351	keys with a matching prefix to those supplied. No additional keys
	1352	will be returned, and every match that is in the index will be
	1353	returned.
	1354	"""
	1355	keys = sorted(set(keys))
	1356	if not keys:
	1357	return
	1358	# Load if needed to check key lengths
	1359	if self._key_count is None:
	1360	self._get_root_node()
	1361	# TODO: only access nodes that can satisfy the prefixes we are looking
	1362	# for. For now, to meet API usage (as this function is not used by
	1363	# current bzrlib) just suck the entire index and iterate in memory.
	1364	nodes = {}
	1365	if self.node_ref_lists:
	1366	if self._key_length == 1:
	1367	for _1, key, value, refs in self.iter_all_entries():
	1368	nodes[key] = value, refs
	1369	else:
	1370	nodes_by_key = {}
	1371	for _1, key, value, refs in self.iter_all_entries():
	1372	key_value = key, value, refs
	1373	# For a key of (foo, bar, baz) create
	1374	# _nodes_by_key[foo][bar][baz] = key_value
	1375	key_dict = nodes_by_key
	1376	for subkey in key[:-1]:
	1377	key_dict = key_dict.setdefault(subkey, {})
	1378	key_dict[key[-1]] = key_value
	1379	else:
	1380	if self._key_length == 1:
	1381	for _1, key, value in self.iter_all_entries():
	1382	nodes[key] = value
	1383	else:
	1384	nodes_by_key = {}
	1385	for _1, key, value in self.iter_all_entries():
	1386	key_value = key, value
	1387	# For a key of (foo, bar, baz) create
	1388	# _nodes_by_key[foo][bar][baz] = key_value
	1389	key_dict = nodes_by_key
	1390	for subkey in key[:-1]:
	1391	key_dict = key_dict.setdefault(subkey, {})
	1392	key_dict[key[-1]] = key_value
	1393	if self._key_length == 1:
	1394	for key in keys:
	1395	# sanity check
	1396	if key[0] is None:
1397	raise errors.BadIndexKey(key)
1398	if len(key) != self._key_length:
1399	raise errors.BadIndexKey(key)
1400	try:
1401	if self.node_ref_lists:
1402	value, node_refs = nodes[key]
1403	yield self, key, value, node_refs
1404	else:
1405	yield self, key, nodes[key]
1406	except KeyError:
1407	pass
1408	return
1409	for key in keys:
1410	# sanity check
1411	if key[0] is None:
1412	raise errors.BadIndexKey(key)
1413	if len(key) != self._key_length:
1414	raise errors.BadIndexKey(key)
1415	# find what it refers to:
1416	key_dict = nodes_by_key
1417	elements = list(key)
1418	# find the subdict whose contents should be returned.
1419	try:
1420	while len(elements) and elements[0] is not None:
1421	key_dict = key_dict[elements[0]]
1422	elements.pop(0)
1423	except KeyError:
1424	# a non-existant lookup.
1425	continue
1426	if len(elements):
1427	dicts = [key_dict]
1428	while dicts:
1429	key_dict = dicts.pop(-1)
1430	# can't be empty or would not exist
1431	item, value = key_dict.iteritems().next()
1432	if type(value) == dict:
1433	# push keys
1434	dicts.extend(key_dict.itervalues())
1435	else:
1436	# yield keys
1437	for value in key_dict.itervalues():
1438	# each value is the key:value:node refs tuple
1439	# ready to yield.
1440	yield (self, ) + value
1441	else:
1442	# the last thing looked up was a terminal element
1443	yield (self, ) + key_dict
1444
1445	def key_count(self):
1446	"""Return an estimate of the number of keys in this index.
1447
1448	For BTreeGraphIndex the estimate is exact as it is contained in the
1449	header.
1450	"""
1451	if self._key_count is None:
1452	self._get_root_node()
1453	return self._key_count
1454
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1455	def _compute_row_offsets(self):
	1456	"""Fill out the _row_offsets attribute based on _row_lengths."""
	1457	offsets = []
	1458	row_offset = 0
	1459	for row in self._row_lengths:
	1460	offsets.append(row_offset)
	1461	row_offset += row
	1462	offsets.append(row_offset)
	1463	self._row_offsets = offsets
	1464
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1465	def _parse_header_from_bytes(self, bytes):
	1466	"""Parse the header from a region of bytes.
	1467
	1468	:param bytes: The data to parse.
	1469	:return: An offset, data tuple such as readv yields, for the unparsed
	1470	data. (which may be of length 0).
	1471	"""
	1472	signature = bytes[0:len(self._signature())]
	1473	if not signature == self._signature():
	1474	raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
	1475	lines = bytes[len(self._signature()):].splitlines()
	1476	options_line = lines[0]
	1477	if not options_line.startswith(_OPTION_NODE_REFS):
	1478	raise errors.BadIndexOptions(self)
	1479	try:
	1480	self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
	1481	except ValueError:
	1482	raise errors.BadIndexOptions(self)
	1483	options_line = lines[1]
	1484	if not options_line.startswith(_OPTION_KEY_ELEMENTS):
	1485	raise errors.BadIndexOptions(self)
	1486	try:
	1487	self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
	1488	except ValueError:
	1489	raise errors.BadIndexOptions(self)
	1490	options_line = lines[2]
	1491	if not options_line.startswith(_OPTION_LEN):
	1492	raise errors.BadIndexOptions(self)
	1493	try:
	1494	self._key_count = int(options_line[len(_OPTION_LEN):])
	1495	except ValueError:
	1496	raise errors.BadIndexOptions(self)
	1497	options_line = lines[3]
	1498	if not options_line.startswith(_OPTION_ROW_LENGTHS):
	1499	raise errors.BadIndexOptions(self)
	1500	try:
	1501	self._row_lengths = map(int, [length for length in
	1502	options_line[len(_OPTION_ROW_LENGTHS):].split(',')
	1503	if len(length)])
	1504	except ValueError:
	1505	raise errors.BadIndexOptions(self)
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1506	self._compute_row_offsets()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1507
	1508	# calculate the bytes we have processed
	1509	header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
	1510	return header_end, bytes[header_end:]
	1511
	1512	def _read_nodes(self, nodes):
	1513	"""Read some nodes from disk into the LRU cache.
	1514
	1515	This performs a readv to get the node data into memory, and parses each
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1516	node, then yields it to the caller. The nodes are requested in the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1517	supplied order. If possible doing sort() on the list before requesting
	1518	a read may improve performance.
	1519
	1520	:param nodes: The nodes to read. 0 - first node, 1 - second node etc.
	1521	:return: None
	1522	"""
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1523	# may be the byte string of the whole file
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1524	bytes = None
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1525	# list of (offset, length) regions of the file that should, evenually
	1526	# be read in to data_ranges, either from 'bytes' or from the transport
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1527	ranges = []
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1528	base_offset = self._base_offset
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1529	for index in nodes:
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1530	offset = (index * _PAGE_SIZE)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1531	size = _PAGE_SIZE
	1532	if index == 0:
	1533	# Root node - special case
	1534	if self._size:
	1535	size = min(_PAGE_SIZE, self._size)
	1536	else:
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1537	# The only case where we don't know the size, is for very
	1538	# small indexes. So we read the whole thing
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1539	bytes = self._transport.get_bytes(self._name)
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1540	num_bytes = len(bytes)
	1541	self._size = num_bytes - base_offset
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1542	# the whole thing should be parsed out of 'bytes'
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1543	ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
	1544	for start in xrange(base_offset, num_bytes, _PAGE_SIZE)]
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1545	break
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1546	else:
3763.8.6 by John Arbash Meinel Fix the logic a bit, and add a bit more tweaking opportunities	1547	if offset > self._size:
	1548	raise AssertionError('tried to read past the end'
	1549	' of the file %s > %s'
	1550	% (offset, self._size))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1551	size = min(size, self._size - offset)
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1552	ranges.append((base_offset + offset, size))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1553	if not ranges:
	1554	return
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1555	elif bytes is not None:
	1556	# already have the whole file
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1557	data_ranges = [(start, bytes[start:start+size])
	1558	for start, size in ranges]
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1559	elif self._file is None:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1560	data_ranges = self._transport.readv(self._name, ranges)
	1561	else:
	1562	data_ranges = []
	1563	for offset, size in ranges:
	1564	self._file.seek(offset)
	1565	data_ranges.append((offset, self._file.read(size)))
	1566	for offset, data in data_ranges:
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1567	offset -= base_offset
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1568	if offset == 0:
	1569	# extract the header
	1570	offset, data = self._parse_header_from_bytes(data)
	1571	if len(data) == 0:
	1572	continue
	1573	bytes = zlib.decompress(data)
	1574	if bytes.startswith(_LEAF_FLAG):
5365.5.18 by John Arbash Meinel Expose the new leaf node factory across the stack.	1575	node = self._leaf_factory(bytes, self._key_length,
	1576	self.node_ref_lists)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1577	elif bytes.startswith(_INTERNAL_FLAG):
	1578	node = _InternalNode(bytes)
	1579	else:
	1580	raise AssertionError("Unknown node type for %r" % bytes)
	1581	yield offset / _PAGE_SIZE, node
	1582
	1583	def _signature(self):
	1584	"""The file signature for this index type."""
	1585	return _BTSIGNATURE
	1586
	1587	def validate(self):
	1588	"""Validate that everything in the index can be accessed."""
	1589	# just read and parse every node.
	1590	self._get_root_node()
	1591	if len(self._row_lengths) > 1:
	1592	start_node = self._row_offsets[1]
	1593	else:
	1594	# We shouldn't be reading anything anyway
	1595	start_node = 1
	1596	node_end = self._row_offsets[-1]
	1597	for node in self._read_nodes(range(start_node, node_end)):
	1598	pass
	1599
	1600
5365.5.18 by John Arbash Meinel Expose the new leaf node factory across the stack.	1601	_gcchk_factory = _LeafNode
	1602
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1603	try:
4459.2.1 by Vincent Ladeuil Use a consistent scheme for naming pyrex source files.	1604	from bzrlib import _btree_serializer_pyx as _btree_serializer
5365.5.18 by John Arbash Meinel Expose the new leaf node factory across the stack.	1605	_gcchk_factory = _btree_serializer._parse_into_chk
4574.3.6 by Martin Pool More warnings when failing to load extensions	1606	except ImportError, e:
4574.3.8 by Martin Pool Only mutter extension load errors when they occur, and record for later	1607	osutils.failed_to_load_extension(e)
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	1608	from bzrlib import _btree_serializer_py as _btree_serializer