~bzr-pqm/bzr/bzr.dev : contents of bzrlib/btree

~bzr-pqm/bzr/bzr.dev : (revision 4838)

3641.3.29 by John Arbash Meinel Cleanup the copyright headers	1	# Copyright (C) 2008 Canonical Ltd
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	2	#
	3	# This program is free software; you can redistribute it and/or modify
3641.3.29 by John Arbash Meinel Cleanup the copyright headers	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob update FSF mailing address	15	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	16	#
	17
	18	"""B+Tree indices"""
	19
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	20	import cStringIO
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	21	from bisect import bisect_right
	22	import math
	23	import tempfile
	24	import zlib
	25
	26	from bzrlib import (
	27	chunk_writer,
	28	debug,
	29	errors,
4208.1.2 by John Arbash Meinel Switch to using a FIFOCache.	30	fifo_cache,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	31	index,
	32	lru_cache,
	33	osutils,
	34	trace,
	35	)
	36	from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
	37	from bzrlib.transport import get_transport
	38
	39
# First line of the serialised index header (see BTreeBuilder._write_nodes).
_BTSIGNATURE = "B+Tree Graph Index 2\n"
# Prefix of the header line that lists the node count of every row.
_OPTION_ROW_LENGTHS = "row_lengths="
# First line written into every leaf node.
_LEAF_FLAG = "type=leaf\n"
# First line written into every internal node.
_INTERNAL_FLAG = "type=internal\n"
# Prefix of the second line of an internal node; followed by a node count
# taken from the row below and a newline.
_INTERNAL_OFFSET = "offset="

# Bytes set aside at the start of the first node of each row so that the
# index header can be placed in front of the root node.
_RESERVED_HEADER_BYTES = 120
# Size in bytes of one node (page) in the serialised index.
_PAGE_SIZE = 4096

# 4K per page: 4MB - 1000 entries
_NODE_CACHE_SIZE = 1000
	51
	52
	53	class _BuilderRow(object):
	54	"""The stored state accumulated while writing out a row in the index.
	55
	56	:ivar spool: A temporary file used to accumulate nodes for this row
	57	in the tree.
	58	:ivar nodes: The count of nodes emitted so far.
	59	"""
	60
	61	def __init__(self):
	62	"""Create a _BuilderRow."""
	63	self.nodes = 0
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	64	self.spool = None# tempfile.TemporaryFile(prefix='bzr-index-row-')
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	65	self.writer = None
	66
	67	def finish_node(self, pad=True):
	68	byte_lines, _, padding = self.writer.finish()
	69	if self.nodes == 0:
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	70	self.spool = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	71	# padded note:
	72	self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	73	elif self.nodes == 1:
	74	# We got bigger than 1 node, switch to a temp file
	75	spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
	76	spool.write(self.spool.getvalue())
	77	self.spool = spool
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	78	skipped_bytes = 0
	79	if not pad and padding:
	80	del byte_lines[-1]
	81	skipped_bytes = padding
	82	self.spool.writelines(byte_lines)
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	83	remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
	84	if remainder != 0:
	85	raise AssertionError("incorrect node length: %d, %d"
	86	% (self.spool.tell(), remainder))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	87	self.nodes += 1
	88	self.writer = None
	89
	90
	91	class _InternalBuilderRow(_BuilderRow):
	92	"""The stored state accumulated while writing out internal rows."""
	93
	94	def finish_node(self, pad=True):
	95	if not pad:
	96	raise AssertionError("Must pad internal nodes only.")
	97	_BuilderRow.finish_node(self)
	98
	99
class _LeafBuilderRow(_BuilderRow):
    """The stored state accumulated while writing out the leaf row.

    Adds nothing to _BuilderRow; the distinct type lets callers tell the
    leaf row apart from internal rows.
    """
	102
	103
	104	class BTreeBuilder(index.GraphIndexBuilder):
	105	"""A Builder for B+Tree based Graph indices.
	106
	107	The resulting graph has the structure:
	108
	109	_SIGNATURE OPTIONS NODES
	110	_SIGNATURE := 'B+Tree Graph Index 1' NEWLINE
	111	OPTIONS := REF_LISTS KEY_ELEMENTS LENGTH
	112	REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
	113	KEY_ELEMENTS := 'key_elements=' DIGITS NEWLINE
	114	LENGTH := 'len=' DIGITS NEWLINE
	115	ROW_LENGTHS := 'row_lengths' DIGITS (COMMA DIGITS)*
	116	NODES := NODE_COMPRESSED*
	117	NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
	118	NODE_RAW := INTERNAL \| LEAF
	119	INTERNAL := INTERNAL_FLAG POINTERS
	120	LEAF := LEAF_FLAG ROWS
	121	KEY_ELEMENT := Not-whitespace-utf8
	122	KEY := KEY_ELEMENT (NULL KEY_ELEMENT)*
	123	ROWS := ROW*
	124	ROW := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
	125	ABSENT := 'a'
	126	REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
	127	REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
	128	REFERENCE := KEY
	129	VALUE := no-newline-no-null-bytes
	130	"""
	131
	132	def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
	133	"""See GraphIndexBuilder.__init__.
	134
	135	:param spill_at: Optional parameter controlling the maximum number
	136	of nodes that BTreeBuilder will hold in memory.
	137	"""
	138	index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
	139	key_elements=key_elements)
	140	self._spill_at = spill_at
	141	self._backing_indices = []
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	142	# A map of {key: (node_refs, value)}
	143	self._nodes = {}
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	144	# Indicate it hasn't been built yet
	145	self._nodes_by_key = None
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	146	self._optimize_for_size = False
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	147
	148	def add_node(self, key, value, references=()):
	149	"""Add a node to the index.
	150
	151	If adding the node causes the builder to reach its spill_at threshold,
	152	disk spilling will be triggered.
	153
	154	:param key: The key. keys are non-empty tuples containing
	155	as many whitespace-free utf8 bytestrings as the key length
	156	defined for this index.
	157	:param references: An iterable of iterables of keys. Each is a
	158	reference to another key.
	159	:param value: The value to associate with the key. It may be any
	160	bytes as long as it does not contain \0 or \n.
	161	"""
3644.2.9 by John Arbash Meinel Refactor some code.	162	# we don't care about absent_references
3644.2.9 by John Arbash Meinel Refactor some code.	163	node_refs, _ = self._check_key_ref_value(key, references, value)
3644.2.2 by John Arbash Meinel the new btree index doesn't have 'absent' keys in its _nodes	164	if key in self._nodes:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	165	raise errors.BadIndexDuplicateKey(key, self)
4679.8.3 by John Arbash Meinel Expose bzrlib.static_tuple.StaticTuple as a thunk	166	# TODO: StaticTuple
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	167	self._nodes[key] = (node_refs, value)
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	168	self._keys.add(key)
3644.2.9 by John Arbash Meinel Refactor some code.	169	if self._nodes_by_key is not None and self._key_length > 1:
3644.2.9 by John Arbash Meinel Refactor some code.	170	self._update_nodes_by_key(key, value, node_refs)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	171	if len(self._keys) < self._spill_at:
	172	return
3644.2.9 by John Arbash Meinel Refactor some code.	173	self._spill_mem_keys_to_disk()
	174
	175	def _spill_mem_keys_to_disk(self):
	176	"""Write the in memory keys down to disk to cap memory consumption.
	177
	178	If we already have some keys written to disk, we will combine them so
	179	as to preserve the sorted order. The algorithm for combining uses
	180	powers of two. So on the first spill, write all mem nodes into a
	181	single index. On the second spill, combine the mem nodes with the nodes
	182	on disk to create a 2x sized disk index and get rid of the first index.
	183	On the third spill, create a single new disk index, which will contain
	184	the mem nodes, and preserve the existing 2x sized index. On the fourth,
	185	combine mem with the first and second indexes, creating a new one of
	186	size 4x. On the fifth create a single new one, etc.
	187	"""
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	188	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	189	(new_backing_file, size,
	190	backing_pos) = self._spill_mem_keys_and_combine()
	191	else:
	192	new_backing_file, size = self._spill_mem_keys_without_combining()
	193	# Note: The transport here isn't strictly needed, because we will use
	194	# direct access to the new_backing._file object
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	195	new_backing = BTreeGraphIndex(get_transport('.'), '<temp>', size)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	196	# GC will clean up the file
	197	new_backing._file = new_backing_file
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	198	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	199	if len(self._backing_indices) == backing_pos:
	200	self._backing_indices.append(None)
	201	self._backing_indices[backing_pos] = new_backing
	202	for backing_pos in range(backing_pos):
	203	self._backing_indices[backing_pos] = None
	204	else:
	205	self._backing_indices.append(new_backing)
	206	self._keys = set()
	207	self._nodes = {}
	208	self._nodes_by_key = None
	209
	210	def _spill_mem_keys_without_combining(self):
	211	return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
	212
	213	def _spill_mem_keys_and_combine(self):
4168.3.4 by John Arbash Meinel Restore the ability to spill, but prepare a flag to disable it.	214	iterators_to_combine = [self._iter_mem_nodes()]
	215	pos = -1
	216	for pos, backing in enumerate(self._backing_indices):
	217	if backing is None:
	218	pos -= 1
	219	break
	220	iterators_to_combine.append(backing.iter_all_entries())
	221	backing_pos = pos + 1
	222	new_backing_file, size = \
	223	self._write_nodes(self._iter_smallest(iterators_to_combine),
	224	allow_optimize=False)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	225	return new_backing_file, size, backing_pos
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	226
	227	def add_nodes(self, nodes):
	228	"""Add nodes to the index.
	229
	230	:param nodes: An iterable of (key, node_refs, value) entries to add.
	231	"""
	232	if self.reference_lists:
	233	for (key, value, node_refs) in nodes:
	234	self.add_node(key, value, node_refs)
	235	else:
	236	for (key, value) in nodes:
	237	self.add_node(key, value)
	238
	239	def _iter_mem_nodes(self):
	240	"""Iterate over the nodes held in memory."""
3644.2.8 by John Arbash Meinel Two quick tweaks.	241	nodes = self._nodes
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	242	if self.reference_lists:
3644.2.8 by John Arbash Meinel Two quick tweaks.	243	for key in sorted(nodes):
	244	references, value = nodes[key]
	245	yield self, key, value, references
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	246	else:
3644.2.8 by John Arbash Meinel Two quick tweaks.	247	for key in sorted(nodes):
	248	references, value = nodes[key]
	249	yield self, key, value
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	250
	251	def _iter_smallest(self, iterators_to_combine):
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	252	if len(iterators_to_combine) == 1:
	253	for value in iterators_to_combine[0]:
	254	yield value
	255	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	256	current_values = []
	257	for iterator in iterators_to_combine:
	258	try:
	259	current_values.append(iterator.next())
	260	except StopIteration:
	261	current_values.append(None)
	262	last = None
	263	while True:
	264	# Decorate candidates with the value to allow 2.4's min to be used.
	265	candidates = [(item[1][1], item) for item
	266	in enumerate(current_values) if item[1] is not None]
	267	if not len(candidates):
	268	return
	269	selected = min(candidates)
	270	# undecorate back to (pos, node)
	271	selected = selected[1]
	272	if last == selected[1][1]:
	273	raise errors.BadIndexDuplicateKey(last, self)
	274	last = selected[1][1]
	275	# Yield, with self as the index
	276	yield (self,) + selected[1][1:]
	277	pos = selected[0]
	278	try:
	279	current_values[pos] = iterators_to_combine[pos].next()
	280	except StopIteration:
	281	current_values[pos] = None
	282
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	283	def _add_key(self, string_key, line, rows, allow_optimize=True):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	284	"""Add a key to the current chunk.
	285
	286	:param string_key: The key to add.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	287	:param line: The fully serialised key and value.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	288	:param allow_optimize: If set to False, prevent setting the optimize
	289	flag when writing out. This is used by the _spill_mem_keys_to_disk
	290	functionality.
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	291	"""
	292	if rows[-1].writer is None:
	293	# opening a new leaf chunk;
	294	for pos, internal_row in enumerate(rows[:-1]):
	295	# flesh out any internal nodes that are needed to
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	296	# preserve the height of the tree
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	297	if internal_row.writer is None:
	298	length = _PAGE_SIZE
	299	if internal_row.nodes == 0:
	300	length -= _RESERVED_HEADER_BYTES # padded
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	301	if allow_optimize:
	302	optimize_for_size = self._optimize_for_size
	303	else:
	304	optimize_for_size = False
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	305	internal_row.writer = chunk_writer.ChunkWriter(length, 0,
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	306	optimize_for_size=optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	307	internal_row.writer.write(_INTERNAL_FLAG)
	308	internal_row.writer.write(_INTERNAL_OFFSET +
	309	str(rows[pos + 1].nodes) + "\n")
	310	# add a new leaf
	311	length = _PAGE_SIZE
	312	if rows[-1].nodes == 0:
	313	length -= _RESERVED_HEADER_BYTES # padded
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	314	rows[-1].writer = chunk_writer.ChunkWriter(length,
	315	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	316	rows[-1].writer.write(_LEAF_FLAG)
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	317	if rows[-1].writer.write(line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	318	# this key did not fit in the node:
	319	rows[-1].finish_node()
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	320	key_line = string_key + "\n"
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	321	new_row = True
	322	for row in reversed(rows[:-1]):
	323	# Mark the start of the next node in the node above. If it
4031.3.1 by Frank Aspell Fixing various typos	324	# doesn't fit then propagate upwards until we find one that
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	325	# it does fit into.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	326	if row.writer.write(key_line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	327	row.finish_node()
	328	else:
	329	# We've found a node that can handle the pointer.
	330	new_row = False
	331	break
	332	# If we reached the current root without being able to mark the
	333	# division point, then we need a new root:
	334	if new_row:
	335	# We need a new row
	336	if 'index' in debug.debug_flags:
	337	trace.mutter('Inserting new global row.')
	338	new_row = _InternalBuilderRow()
	339	reserved_bytes = 0
	340	rows.insert(0, new_row)
	341	# This will be padded, hence the -100
	342	new_row.writer = chunk_writer.ChunkWriter(
	343	_PAGE_SIZE - _RESERVED_HEADER_BYTES,
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	344	reserved_bytes,
	345	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	346	new_row.writer.write(_INTERNAL_FLAG)
	347	new_row.writer.write(_INTERNAL_OFFSET +
	348	str(rows[1].nodes - 1) + "\n")
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	349	new_row.writer.write(key_line)
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	350	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	351
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	352	def _write_nodes(self, node_iterator, allow_optimize=True):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	353	"""Write node_iterator out as a B+Tree.
	354
	355	:param node_iterator: An iterator of sorted nodes. Each node should
	356	match the output given by iter_all_entries.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	357	:param allow_optimize: If set to False, prevent setting the optimize
	358	flag when writing out. This is used by the _spill_mem_keys_to_disk
	359	functionality.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	360	:return: A file handle for a temporary file containing a B+Tree for
	361	the nodes.
	362	"""
	363	# The index rows - rows[0] is the root, rows[1] is the layer under it
	364	# etc.
	365	rows = []
	366	# forward sorted by key. In future we may consider topological sorting,
	367	# at the cost of table scans for direct lookup, or a second index for
	368	# direct lookup
	369	key_count = 0
	370	# A stack with the number of nodes of each size. 0 is the root node
	371	# and must always be 1 (if there are any nodes in the tree).
	372	self.row_lengths = []
	373	# Loop over all nodes adding them to the bottom row
	374	# (rows[-1]). When we finish a chunk in a row,
4031.3.1 by Frank Aspell Fixing various typos	375	# propagate the key that didn't fit (comes after the chunk) to the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	376	# row above, transitively.
	377	for node in node_iterator:
	378	if key_count == 0:
	379	# First key triggers the first row
	380	rows.append(_LeafBuilderRow())
	381	key_count += 1
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	382	string_key, line = _btree_serializer._flatten_node(node,
	383	self.reference_lists)
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	384	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	385	for row in reversed(rows):
	386	pad = (type(row) != _LeafBuilderRow)
	387	row.finish_node(pad=pad)
	388	lines = [_BTSIGNATURE]
	389	lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
	390	lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
	391	lines.append(_OPTION_LEN + str(key_count) + '\n')
	392	row_lengths = [row.nodes for row in rows]
	393	lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	394	if row_lengths and row_lengths[-1] > 1:
	395	result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
	396	else:
	397	result = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	398	result.writelines(lines)
	399	position = sum(map(len, lines))
	400	root_row = True
	401	if position > _RESERVED_HEADER_BYTES:
	402	raise AssertionError("Could not fit the header in the"
	403	" reserved space: %d > %d"
	404	% (position, _RESERVED_HEADER_BYTES))
	405	# write the rows out:
	406	for row in rows:
	407	reserved = _RESERVED_HEADER_BYTES # reserved space for first node
	408	row.spool.flush()
	409	row.spool.seek(0)
	410	# copy nodes to the finalised file.
	411	# Special case the first node as it may be prefixed
	412	node = row.spool.read(_PAGE_SIZE)
	413	result.write(node[reserved:])
4771.3.1 by John Arbash Meinel We don't have to pad 'short' records.	414	if len(node) == _PAGE_SIZE:
	415	result.write("\x00" * (reserved - position))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	416	position = 0 # Only the root row actually has an offset
	417	copied_len = osutils.pumpfile(row.spool, result)
	418	if copied_len != (row.nodes - 1) * _PAGE_SIZE:
	419	if type(row) != _LeafBuilderRow:
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	420	raise AssertionError("Incorrect amount of data copied"
	421	" expected: %d, got: %d"
	422	% ((row.nodes - 1) * _PAGE_SIZE,
	423	copied_len))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	424	result.flush()
	425	size = result.tell()
	426	result.seek(0)
	427	return result, size
	428
	429	def finish(self):
	430	"""Finalise the index.
	431
	432	:return: A file handle for a temporary file containing the nodes added
	433	to the index.
	434	"""
	435	return self._write_nodes(self.iter_all_entries())[0]
	436
	437	def iter_all_entries(self):
	438	"""Iterate over all keys within the index
	439
4343.2.2 by John Arbash Meinel Fix an important doc bug about the api of iter_all_entries()	440	:return: An iterable of (index, key, value, reference_lists). There is
	441	no defined order for the result iteration - it will be in the most
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	442	efficient order for the index (in this case dictionary hash order).
	443	"""
	444	if 'evil' in debug.debug_flags:
	445	trace.mutter_callsite(3,
	446	"iter_all_entries scales with size of history.")
	447	# Doing serial rather than ordered would be faster; but this shouldn't
	448	# be getting called routinely anyway.
3644.2.8 by John Arbash Meinel Two quick tweaks.	449	iterators = [self._iter_mem_nodes()]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	450	for backing in self._backing_indices:
	451	if backing is not None:
	452	iterators.append(backing.iter_all_entries())
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	453	if len(iterators) == 1:
	454	return iterators[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	455	return self._iter_smallest(iterators)
	456
	457	def iter_entries(self, keys):
	458	"""Iterate over keys within the index.
	459
	460	:param keys: An iterable providing the keys to be retrieved.
	461	:return: An iterable of (index, key, value, reference_lists). There is no
	462	defined order for the result iteration - it will be in the most
	463	efficient order for the index (keys iteration order in this case).
	464	"""
	465	keys = set(keys)
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	466	local_keys = keys.intersection(self._keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	467	if self.reference_lists:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	468	for key in local_keys:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	469	node = self._nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	470	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	471	else:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	472	for key in local_keys:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	473	node = self._nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	474	yield self, key, node[1]
3847.2.1 by John Arbash Meinel Shortcut BTreeBuilder.iter_entries when there are no backing indices.	475	# Find things that are in backing indices that have not been handled
	476	# yet.
3847.2.3 by John Arbash Meinel Bring back the shortcut	477	if not self._backing_indices:
3847.2.3 by John Arbash Meinel Bring back the shortcut	478	return # We won't find anything there either
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	479	# Remove all of the keys that we found locally
	480	keys.difference_update(local_keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	481	for backing in self._backing_indices:
	482	if backing is None:
	483	continue
	484	if not keys:
	485	return
	486	for node in backing.iter_entries(keys):
	487	keys.remove(node[1])
	488	yield (self,) + node[1:]
	489
	490	def iter_entries_prefix(self, keys):
	491	"""Iterate over keys within the index using prefix matching.
	492
	493	Prefix matching is applied within the tuple of a key, not to within
	494	the bytestring of each key element. e.g. if you have the keys ('foo',
	495	'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
	496	only the former key is returned.
	497
	498	:param keys: An iterable providing the key prefixes to be retrieved.
	499	Each key prefix takes the form of a tuple the length of a key, but
	500	with the last N elements 'None' rather than a regular bytestring.
	501	The first element cannot be 'None'.
	502	:return: An iterable as per iter_all_entries, but restricted to the
	503	keys with a matching prefix to those supplied. No additional keys
	504	will be returned, and every match that is in the index will be
	505	returned.
	506	"""
	507	# XXX: To much duplication with the GraphIndex class; consider finding
	508	# a good place to pull out the actual common logic.
	509	keys = set(keys)
	510	if not keys:
	511	return
	512	for backing in self._backing_indices:
	513	if backing is None:
	514	continue
	515	for node in backing.iter_entries_prefix(keys):
	516	yield (self,) + node[1:]
	517	if self._key_length == 1:
	518	for key in keys:
	519	# sanity check
	520	if key[0] is None:
	521	raise errors.BadIndexKey(key)
	522	if len(key) != self._key_length:
	523	raise errors.BadIndexKey(key)
	524	try:
	525	node = self._nodes[key]
	526	except KeyError:
	527	continue
	528	if self.reference_lists:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	529	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	530	else:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	531	yield self, key, node[1]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	532	return
	533	for key in keys:
	534	# sanity check
	535	if key[0] is None:
	536	raise errors.BadIndexKey(key)
	537	if len(key) != self._key_length:
	538	raise errors.BadIndexKey(key)
	539	# find what it refers to:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	540	key_dict = self._get_nodes_by_key()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	541	elements = list(key)
	542	# find the subdict to return
	543	try:
	544	while len(elements) and elements[0] is not None:
	545	key_dict = key_dict[elements[0]]
	546	elements.pop(0)
	547	except KeyError:
	548	# a non-existant lookup.
	549	continue
	550	if len(elements):
	551	dicts = [key_dict]
	552	while dicts:
	553	key_dict = dicts.pop(-1)
	554	# can't be empty or would not exist
	555	item, value = key_dict.iteritems().next()
	556	if type(value) == dict:
	557	# push keys
	558	dicts.extend(key_dict.itervalues())
	559	else:
	560	# yield keys
	561	for value in key_dict.itervalues():
	562	yield (self, ) + value
	563	else:
	564	yield (self, ) + key_dict
	565
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	566	def _get_nodes_by_key(self):
	567	if self._nodes_by_key is None:
	568	nodes_by_key = {}
	569	if self.reference_lists:
	570	for key, (references, value) in self._nodes.iteritems():
	571	key_dict = nodes_by_key
	572	for subkey in key[:-1]:
	573	key_dict = key_dict.setdefault(subkey, {})
	574	key_dict[key[-1]] = key, value, references
	575	else:
	576	for key, (references, value) in self._nodes.iteritems():
	577	key_dict = nodes_by_key
	578	for subkey in key[:-1]:
	579	key_dict = key_dict.setdefault(subkey, {})
	580	key_dict[key[-1]] = key, value
	581	self._nodes_by_key = nodes_by_key
	582	return self._nodes_by_key
	583
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	584	def key_count(self):
	585	"""Return an estimate of the number of keys in this index.
	586
	587	For InMemoryGraphIndex the estimate is exact.
	588	"""
	589	return len(self._keys) + sum(backing.key_count() for backing in
	590	self._backing_indices if backing is not None)
	591
	592	def validate(self):
	593	"""In memory index's have no known corruption at the moment."""
	594
	595
	596	class _LeafNode(object):
	597	"""A leaf node for a serialised B+Tree index."""
	598
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	599	__slots__ = ('keys', 'min_key', 'max_key')
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	600
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	601	def __init__(self, bytes, key_length, ref_list_length):
	602	"""Parse bytes to create a leaf node object."""
	603	# splitlines mangles the \r delimiters.. don't use it.
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	604	key_list = _btree_serializer._parse_leaf_lines(bytes,
	605	key_length, ref_list_length)
	606	if key_list:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	607	self.min_key = key_list[0][0]
	608	self.max_key = key_list[-1][0]
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	609	else:
	610	self.min_key = self.max_key = None
	611	self.keys = dict(key_list)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	612
	613
	614	class _InternalNode(object):
	615	"""An internal node for a serialised B+Tree index."""
	616
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	617	__slots__ = ('keys', 'offset')
	618
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	619	def __init__(self, bytes):
	620	"""Parse bytes to create an internal node object."""
	621	# splitlines mangles the \r delimiters.. don't use it.
	622	self.keys = self._parse_lines(bytes.split('\n'))
	623
	624	def _parse_lines(self, lines):
	625	nodes = []
	626	self.offset = int(lines[1][7:])
	627	for line in lines[2:]:
	628	if line == '':
	629	break
4679.8.3 by John Arbash Meinel Expose bzrlib.static_tuple.StaticTuple as a thunk	630	# TODO: Switch to StaticTuple here.
4075.3.1 by John Arbash Meinel Use PyString_InternInPlace to intern() the various parts of keys that are processed.	631	nodes.append(tuple(map(intern, line.split('\0'))))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	632	return nodes
	633
	634
	635	class BTreeGraphIndex(object):
	636	"""Access to nodes via the standard GraphIndex interface for B+Tree's.
	637
	638	Individual nodes are held in a LRU cache. This holds the root node in
	639	memory except when very large walks are done.
	640	"""
	641
4634.71.1 by John Arbash Meinel Work around bug #402623 by allowing BTreeGraphIndex(...,unlimited_cache=True).	642	def __init__(self, transport, name, size, unlimited_cache=False):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	643	"""Create a B+Tree index object on the index name.
	644
	645	:param transport: The transport to read data for the index from.
	646	:param name: The file name of the index on transport.
	647	:param size: Optional size of the index in bytes. This allows
	648	compatibility with the GraphIndex API, as well as ensuring that
	649	the initial read (to read the root node header) can be done
	650	without over-reading even on empty indices, and on small indices
	651	allows single-IO to read the entire index.
4634.71.1 by John Arbash Meinel Work around bug #402623 by allowing BTreeGraphIndex(...,unlimited_cache=True).	652	:param unlimited_cache: If set to True, then instead of using an
	653	LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
	654	cache all leaf nodes.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	655	"""
	656	self._transport = transport
	657	self._name = name
	658	self._size = size
	659	self._file = None
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	660	self._recommended_pages = self._compute_recommended_pages()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	661	self._root_node = None
	662	# Default max size is 100,000 leave values
	663	self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
4634.71.1 by John Arbash Meinel Work around bug #402623 by allowing BTreeGraphIndex(...,unlimited_cache=True).	664	if unlimited_cache:
	665	self._leaf_node_cache = {}
	666	self._internal_node_cache = {}
	667	else:
	668	self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
	669	# We use a FIFO here just to prevent possible blowout. However, a
	670	# 300k record btree has only 3k leaf nodes, and only 20 internal
	671	# nodes. A value of 100 scales to ~100100100 = 1M records.
	672	self._internal_node_cache = fifo_cache.FIFOCache(100)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	673	self._key_count = None
	674	self._row_lengths = None
	675	self._row_offsets = None # Start of each row, [-1] is the end
	676
	677	def __eq__(self, other):
	678	"""Equal when self and other were created with the same parameters."""
	679	return (
	680	type(self) == type(other) and
	681	self._transport == other._transport and
	682	self._name == other._name and
	683	self._size == other._size)
	684
	685	def __ne__(self, other):
	686	return not self.__eq__(other)
	687
3763.8.12 by John Arbash Meinel Code cleanup.	688	def _get_and_cache_nodes(self, nodes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	689	"""Read nodes and cache them in the lru.
	690
	691	The nodes list supplied is sorted and then read from disk, each node
	692	being inserted it into the _node_cache.
	693
	694	Note: Asking for more nodes than the _node_cache can contain will
	695	result in some of the results being immediately discarded, to prevent
	696	this an assertion is raised if more nodes are asked for than are
	697	cachable.
	698
	699	:return: A dict of {node_pos: node}
	700	"""
	701	found = {}
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	702	start_of_leaves = None
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	703	for node_pos, node in self._read_nodes(sorted(nodes)):
	704	if node_pos == 0: # Special case
	705	self._root_node = node
	706	else:
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	707	if start_of_leaves is None:
	708	start_of_leaves = self._row_offsets[-2]
	709	if node_pos < start_of_leaves:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	710	self._internal_node_cache[node_pos] = node
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	711	else:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	712	self._leaf_node_cache[node_pos] = node
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	713	found[node_pos] = node
	714	return found
	715
def _compute_recommended_pages(self):
    """Convert transport's recommended_page_size into btree pages.

    recommended_page_size is in bytes; the result is the number of
    _PAGE_SIZE pages needed to cover that many bytes, rounded up.

    :return: The page count as an int.
    """
    recommended_bytes = self._transport.recommended_page_size()
    # float() forces true division so a partial page rounds up.
    pages = math.ceil(recommended_bytes / float(_PAGE_SIZE))
    return int(pages)
	726
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	727	def _compute_total_pages_in_index(self):
	728	"""How many pages are in the index.
	729
	730	If we have read the header we will use the value stored there.
	731	Otherwise it will be computed based on the length of the index.
	732	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	733	if self._size is None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	734	raise AssertionError('_compute_total_pages_in_index should not be'
	735	' called when self._size is None')
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	736	if self._root_node is not None:
	737	# This is the number of pages as defined by the header
	738	return self._row_offsets[-1]
	739	# This is the number of pages as defined by the size of the index. They
	740	# should be indentical.
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	741	total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
	742	return total_pages
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	743
def _expand_offsets(self, offsets):
    """Find extra pages to download.

    The idea is that we always want to make big-enough requests (like 64kB
    for http), so that we don't waste round trips. So given the entries
    that we already have cached and the new pages being downloaded, figure
    out what other pages we might want to read.

    See also doc/developers/btree_index_prefetch.txt for more details.

    :param offsets: The offsets to be read
    :return: A list of offsets to download
    """
    log_enabled = 'index' in debug.debug_flags
    if log_enabled:
        trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)

    recommended = self._recommended_pages
    if len(offsets) >= recommended:
        # The request is already at least as big as recommended.
        if log_enabled:
            trace.mutter(' not expanding large request (%s >= %s)',
                         len(offsets), recommended)
        return offsets
    if self._size is None:
        # Without the size we don't know where the file ends.
        if log_enabled:
            trace.mutter(' not expanding without knowing index size')
        return offsets

    total_pages = self._compute_total_pages_in_index()
    cached_offsets = self._get_offsets_to_cached_pages()
    unread_count = total_pages - len(cached_offsets)
    if unread_count <= recommended:
        # Reading recommended_pages would read the rest of the index, so
        # just read whatever is left.
        if not cached_offsets:
            expanded = range(total_pages)
        else:
            expanded = [x for x in xrange(total_pages)
                        if x not in cached_offsets]
        if log_enabled:
            trace.mutter(' reading all unread pages: %s', expanded)
        return expanded

    if self._root_node is not None:
        tree_depth = len(self._row_lengths)
        if len(cached_offsets) < tree_depth and len(offsets) == 1:
            # We haven't read enough to justify expansion. If we are only
            # going to read the root node and 1 leaf node then it isn't
            # worth expanding the request. Once at least 2 nodes have been
            # read we are probably doing a search, and start expanding.
            if log_enabled:
                trace.mutter(' not expanding on first reads')
            return offsets
        final_offsets = self._expand_to_neighbors(offsets, cached_offsets,
                                                  total_pages)
    else:
        # ATM on the first read of the root node of a large index, we don't
        # bother pre-reading any other pages, because the likelihood of
        # actually reading interesting pages is very low.
        # See doc/developers/btree_index_prefetch.txt for a discussion, and
        # a possible implementation when we are guessing that the second
        # layer index is small.
        final_offsets = offsets

    final_offsets = sorted(final_offsets)
    if log_enabled:
        trace.mutter('expanded: %s', final_offsets)
    return final_offsets
	812
	813	def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
	814	"""Expand requests to neighbors until we have enough pages.
	815
	816	This is called from _expand_offsets after policy has determined that we
	817	want to expand.
	818	We only want to expand requests within a given layer. We cheat a little
	819	bit and assume all requests will be in the same layer. This is true
	820	given the current design, but if it changes this algorithm may perform
	821	oddly.
	822
	823	:param offsets: requested offsets
	824	:param cached_offsets: offsets for pages we currently have cached
	825	:return: A set() of offsets after expansion
	826	"""
	827	final_offsets = set(offsets)
	828	first = end = None
	829	new_tips = set(final_offsets)
	830	while len(final_offsets) < self._recommended_pages and new_tips:
	831	next_tips = set()
	832	for pos in new_tips:
	833	if first is None:
	834	first, end = self._find_layer_first_and_end(pos)
	835	previous = pos - 1
	836	if (previous > 0
	837	and previous not in cached_offsets
	838	and previous not in final_offsets
	839	and previous >= first):
	840	next_tips.add(previous)
	841	after = pos + 1
	842	if (after < total_pages
	843	and after not in cached_offsets
	844	and after not in final_offsets
	845	and after < end):
	846	next_tips.add(after)
	847	# This would keep us from going bigger than
	848	# recommended_pages by only expanding the first offsets.
	849	# However, if we are making a 'wide' request, it is
	850	# reasonable to expand all points equally.
	851	# if len(final_offsets) > recommended_pages:
	852	# break
	853	final_offsets.update(next_tips)
	854	new_tips = next_tips
	855	return final_offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	856
def clear_cache(self):
    """Clear out any cached/memoized values.

    This can be called at any time, but generally it is used when we have
    extracted some information, but don't expect to be requesting any more
    from this index.
    """
    # Only the leaf nodes are dropped. self._root_node and
    # self._internal_node_cache are deliberately kept: neither is expected
    # to be big, and keeping them can save round-trips in the future. This
    # may be re-evaluated if InternalNode memory starts to be an issue.
    leaf_cache = self._leaf_node_cache
    leaf_cache.clear()
	869
def external_references(self, ref_list_num):
    """Return the referenced keys that are not present in this index.

    Walks every entry, collecting each entry's key and the references in
    its ref_list_num'th reference list.

    :param ref_list_num: Which reference list of each entry to inspect.
    :return: A set of keys that are referenced but not present.
    :raises ValueError: If the index has no reference list ref_list_num.
    """
    if self._root_node is None:
        # The root must be read before node_ref_lists is consulted.
        self._get_root_node()
    if ref_list_num + 1 > self.node_ref_lists:
        raise ValueError('No ref list %d, index has %d ref lists'
            % (ref_list_num, self.node_ref_lists))
    present = set()
    referenced = set()
    for entry in self.iter_all_entries():
        present.add(entry[1])
        referenced.update(entry[3][ref_list_num])
    return referenced.difference(present)
	882
3763.8.12 by John Arbash Meinel Code cleanup.	883	def _find_layer_first_and_end(self, offset):
	884	"""Find the start/stop nodes for the layer corresponding to offset.
	885
	886	:return: (first, end)
	887	first is the first node in this layer
	888	end is the first node of the next layer
	889	"""
	890	first = end = 0
	891	for roffset in self._row_offsets:
	892	first = end
	893	end = roffset
	894	if offset < roffset:
	895	break
	896	return first, end
	897
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	898	def _get_offsets_to_cached_pages(self):
3763.8.12 by John Arbash Meinel Code cleanup.	899	"""Determine what nodes we already have cached."""
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	900	cached_offsets = set(self._internal_node_cache.keys())
	901	cached_offsets.update(self._leaf_node_cache.keys())
3763.8.12 by John Arbash Meinel Code cleanup.	902	if self._root_node is not None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	903	cached_offsets.add(0)
	904	return cached_offsets
3763.8.12 by John Arbash Meinel Code cleanup.	905
	906	def _get_root_node(self):
	907	if self._root_node is None:
	908	# We may not have a root node yet
	909	self._get_internal_nodes([0])
	910	return self._root_node
	911
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	912	def _get_nodes(self, cache, node_indexes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	913	found = {}
	914	needed = []
	915	for idx in node_indexes:
	916	if idx == 0 and self._root_node is not None:
	917	found[0] = self._root_node
	918	continue
	919	try:
	920	found[idx] = cache[idx]
	921	except KeyError:
	922	needed.append(idx)
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	923	if not needed:
	924	return found
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	925	needed = self._expand_offsets(needed)
3763.8.12 by John Arbash Meinel Code cleanup.	926	found.update(self._get_and_cache_nodes(needed))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	927	return found
	928
	929	def _get_internal_nodes(self, node_indexes):
	930	"""Get a node, from cache or disk.
	931
	932	After getting it, the node will be cached.
	933	"""
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	934	return self._get_nodes(self._internal_node_cache, node_indexes)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	935
3805.4.6 by John Arbash Meinel refactor for clarity.	936	def _cache_leaf_values(self, nodes):
3805.4.6 by John Arbash Meinel refactor for clarity.	937	"""Cache directly from key => value, skipping the btree."""
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	938	if self._leaf_value_cache is not None:
3805.4.6 by John Arbash Meinel refactor for clarity.	939	for node in nodes.itervalues():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	940	for key, value in node.keys.iteritems():
	941	if key in self._leaf_value_cache:
	942	# Don't add the rest of the keys, we've seen this node
	943	# before.
	944	break
	945	self._leaf_value_cache[key] = value
3805.4.6 by John Arbash Meinel refactor for clarity.	946
	947	def _get_leaf_nodes(self, node_indexes):
	948	"""Get a bunch of nodes, from cache or disk."""
	949	found = self._get_nodes(self._leaf_node_cache, node_indexes)
	950	self._cache_leaf_values(found)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	951	return found
	952
	953	def iter_all_entries(self):
	954	"""Iterate over all keys within the index.
	955
	956	:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
	957	The former tuple is used when there are no reference lists in the
	958	index, making the API compatible with simple key:value index types.
	959	There is no defined order for the result iteration - it will be in
	960	the most efficient order for the index.
	961	"""
	962	if 'evil' in debug.debug_flags:
	963	trace.mutter_callsite(3,
	964	"iter_all_entries scales with size of history.")
	965	if not self.key_count():
	966	return
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	967	if self._row_offsets[-1] == 1:
	968	# There is only the root node, and we read that via key_count()
	969	if self.node_ref_lists:
	970	for key, (value, refs) in sorted(self._root_node.keys.items()):
	971	yield (self, key, value, refs)
	972	else:
	973	for key, (value, refs) in sorted(self._root_node.keys.items()):
	974	yield (self, key, value)
	975	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	976	start_of_leaves = self._row_offsets[-2]
	977	end_of_leaves = self._row_offsets[-1]
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	978	needed_offsets = range(start_of_leaves, end_of_leaves)
	979	if needed_offsets == [0]:
	980	# Special case when we only have a root node, as we have already
	981	# read everything
	982	nodes = [(0, self._root_node)]
	983	else:
	984	nodes = self._read_nodes(needed_offsets)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	985	# We iterate strictly in-order so that we can use this function
	986	# for spilling index builds to disk.
	987	if self.node_ref_lists:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	988	for _, node in nodes:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	989	for key, (value, refs) in sorted(node.keys.items()):
	990	yield (self, key, value, refs)
	991	else:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	992	for _, node in nodes:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	993	for key, (value, refs) in sorted(node.keys.items()):
	994	yield (self, key, value)
	995
	996	@staticmethod
	997	def _multi_bisect_right(in_keys, fixed_keys):
	998	"""Find the positions where each 'in_key' would fit in fixed_keys.
	999
	1000	This is equivalent to doing "bisect_right" on each in_key into
	1001	fixed_keys
	1002
	1003	:param in_keys: A sorted list of keys to match with fixed_keys
	1004	:param fixed_keys: A sorted list of keys to match against
	1005	:return: A list of (integer position, [key list]) tuples.
	1006	"""
	1007	if not in_keys:
	1008	return []
	1009	if not fixed_keys:
	1010	# no pointers in the fixed_keys list, which means everything must
	1011	# fall to the left.
	1012	return [(0, in_keys)]
	1013
	1014	# TODO: Iterating both lists will generally take M + N steps
	1015	# Bisecting each key will generally take M * log2 N steps.
	1016	# If we had an efficient way to compare, we could pick the method
	1017	# based on which has the fewer number of steps.
	1018	# There is also the argument that bisect_right is a compiled
	1019	# function, so there is even more to be gained.
	1020	# iter_steps = len(in_keys) + len(fixed_keys)
	1021	# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
	1022	if len(in_keys) == 1: # Bisect will always be faster for M = 1
	1023	return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
	1024	# elif bisect_steps < iter_steps:
	1025	# offsets = {}
	1026	# for key in in_keys:
	1027	# offsets.setdefault(bisect_right(fixed_keys, key),
	1028	# []).append(key)
	1029	# return [(o, offsets[o]) for o in sorted(offsets)]
	1030	in_keys_iter = iter(in_keys)
	1031	fixed_keys_iter = enumerate(fixed_keys)
	1032	cur_in_key = in_keys_iter.next()
	1033	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
	1034
	1035	class InputDone(Exception): pass
	1036	class FixedDone(Exception): pass
	1037
	1038	output = []
	1039	cur_out = []
	1040
	1041	# TODO: Another possibility is that rather than iterating on each side,
	1042	# we could use a combination of bisecting and iterating. For
	1043	# example, while cur_in_key < fixed_key, bisect to find its
	1044	# point, then iterate all matching keys, then bisect (restricted
	1045	# to only the remainder) for the next one, etc.
	1046	try:
	1047	while True:
	1048	if cur_in_key < cur_fixed_key:
	1049	cur_keys = []
	1050	cur_out = (cur_fixed_offset, cur_keys)
	1051	output.append(cur_out)
	1052	while cur_in_key < cur_fixed_key:
	1053	cur_keys.append(cur_in_key)
	1054	try:
	1055	cur_in_key = in_keys_iter.next()
	1056	except StopIteration:
1057	raise InputDone
1058	# At this point cur_in_key must be >= cur_fixed_key
1059	# step the cur_fixed_key until we pass the cur key, or walk off
1060	# the end
1061	while cur_in_key >= cur_fixed_key:
1062	try:
1063	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
1064	except StopIteration:
1065	raise FixedDone
1066	except InputDone:
1067	# We consumed all of the input, nothing more to do
1068	pass
1069	except FixedDone:
1070	# There was some input left, but we consumed all of fixed, so we
1071	# have to add one more for the tail
1072	cur_keys = [cur_in_key]
1073	cur_keys.extend(in_keys_iter)
1074	cur_out = (len(fixed_keys), cur_keys)
1075	output.append(cur_out)
1076	return output
1077
4593.4.5 by John Arbash Meinel Start adding some tests.	1078	def _walk_through_internal_nodes(self, keys):
	1079	"""Take the given set of keys, and find the corresponding LeafNodes.
	1080
	1081	:param keys: An unsorted iterable of keys to search for
	1082	:return: (nodes, index_and_keys)
	1083	nodes is a dict mapping {index: LeafNode}
	1084	keys_at_index is a list of tuples of [(index, [keys for Leaf])]
	1085	"""
	1086	# 6 seconds spent in miss_torture using the sorted() line.
	1087	# Even with out of order disk IO it seems faster not to sort it when
	1088	# large queries are being made.
	1089	keys_at_index = [(0, sorted(keys))]
	1090
	1091	for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
	1092	node_indexes = [idx for idx, s_keys in keys_at_index]
	1093	nodes = self._get_internal_nodes(node_indexes)
	1094
	1095	next_nodes_and_keys = []
	1096	for node_index, sub_keys in keys_at_index:
	1097	node = nodes[node_index]
	1098	positions = self._multi_bisect_right(sub_keys, node.keys)
	1099	node_offset = next_row_start + node.offset
	1100	next_nodes_and_keys.extend([(node_offset + pos, s_keys)
	1101	for pos, s_keys in positions])
	1102	keys_at_index = next_nodes_and_keys
	1103	# We should now be at the _LeafNodes
	1104	node_indexes = [idx for idx, s_keys in keys_at_index]
	1105
	1106	# TODO: We may not want to always read all the nodes in one
	1107	# big go. Consider setting a max size on this.
	1108	nodes = self._get_leaf_nodes(node_indexes)
	1109	return nodes, keys_at_index
	1110
def iter_entries(self, keys):
    """Iterate over keys within the index.

    :param keys: An iterable providing the keys to be retrieved.
    :return: An iterable as per iter_all_entries, but restricted to the
        keys supplied. No additional keys will be returned, and every
        key supplied that is in the index will be returned.
    """
    # 6 seconds spent in miss_torture using the sorted() line.
    # Even with out of order disk IO it seems faster not to sort it when
    # large queries are being made.
    # However, now that we are doing multi-way bisecting, we need the keys
    # in sorted order anyway. We could change the multi-way code to not
    # require sorted order. (For example, it bisects for the first node,
    # does an in-order search until a key comes before the current point,
    # which it then bisects for, etc.)
    keys = frozenset(keys)
    if not keys:
        return

    if not self.key_count():
        return

    if self._leaf_value_cache is None:
        needed_keys = keys
    else:
        # Serve what we can straight from the leaf value cache and only
        # walk the tree for the remainder.
        needed_keys = []
        for key in keys:
            value = self._leaf_value_cache.get(key, None)
            if value is not None:
                # This key is already cached: yield it without touching
                # the btree.
                value, refs = value
                if self.node_ref_lists:
                    yield (self, key, value, refs)
                else:
                    yield (self, key, value)
            else:
                needed_keys.append(key)

    # needed_keys must not be reset to the full key set here, otherwise
    # every key served from the cache above is looked up and yielded again.
    if not needed_keys:
        return
    nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
    for node_index, sub_keys in nodes_and_keys:
        if not sub_keys:
            continue
        node = nodes[node_index]
        for next_sub_key in sub_keys:
            if next_sub_key in node.keys:
                value, refs = node.keys[next_sub_key]
                if self.node_ref_lists:
                    yield (self, next_sub_key, value, refs)
                else:
                    yield (self, next_sub_key, value)
	1166
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1167	def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1168	"""Find the parent_map information for the set of keys.
	1169
	1170	This populates the parent_map dict and missing_keys set based on the
	1171	queried keys. It also can fill out an arbitrary number of parents that
	1172	it finds while searching for the supplied keys.
	1173
	1174	It is unlikely that you want to call this directly. See
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1175	"CombinedGraphIndex.find_ancestry()" for a more appropriate API.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1176
	1177	:param keys: A keys whose ancestry we want to return
	1178	Every key will either end up in 'parent_map' or 'missing_keys'.
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1179	:param ref_list_num: This index in the ref_lists is the parents we
	1180	care about.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1181	:param parent_map: {key: parent_keys} for keys that are present in this
	1182	index. This may contain more entries than were in 'keys', that are
	1183	reachable ancestors of the keys requested.
4593.4.5 by John Arbash Meinel Start adding some tests.	1184	:param missing_keys: keys which are known to be missing in this index.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1185	This may include parents that were not directly requested, but we
	1186	were able to determine that they are not present in this index.
	1187	:return: search_keys parents that were found but not queried to know
	1188	if they are missing or present. Callers can re-query this index for
	1189	those keys, and they will be placed into parent_map or missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1190	"""
	1191	if not self.key_count():
	1192	# We use key_count() to trigger reading the root node and
	1193	# determining info about this BTreeGraphIndex
	1194	# If we don't have any keys, then everything is missing
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1195	missing_keys.update(keys)
	1196	return set()
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1197	if ref_list_num >= self.node_ref_lists:
	1198	raise ValueError('No ref list %d, index has %d ref lists'
	1199	% (ref_list_num, self.node_ref_lists))
	1200
	1201	# The main trick we are trying to accomplish is that when we find a
	1202	# key listing its parents, we expect that the parent key is also likely
	1203	# to sit on the same page. Allowing us to expand parents quickly
	1204	# without suffering the full stack of bisecting, etc.
4593.4.5 by John Arbash Meinel Start adding some tests.	1205	nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1206
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1207	# These are parent keys which could not be immediately resolved on the
	1208	# page where the child was present. Note that we may already be
	1209	# searching for that key, and it may actually be present [or known
	1210	# missing] on one of the other pages we are reading.
	1211	# TODO:
	1212	# We could try searching for them in the immediate previous or next
	1213	# page. If they occur "later" we could put them in a pending lookup
	1214	# set, and then for each node we read thereafter we could check to
	1215	# see if they are present.
	1216	# However, we don't know the impact of keeping this list of things
	1217	# that I'm going to search for every node I come across from here on
	1218	# out.
	1219	# It doesn't handle the case when the parent key is missing on a
	1220	# page that we don't read. So we already have to handle being
	1221	# re-entrant for that.
	1222	# Since most keys contain a date string, they are more likely to be
	1223	# found earlier in the file than later, but we would know that right
	1224	# away (key < min_key), and wouldn't keep searching it on every other
	1225	# page that we read.
	1226	# Mostly, it is an idea, one which should be benchmarked.
	1227	parents_not_on_page = set()
	1228
	1229	for node_index, sub_keys in nodes_and_keys:
	1230	if not sub_keys:
	1231	continue
	1232	# sub_keys is all of the keys we are looking for that should exist
	1233	# on this page, if they aren't here, then they won't be found
	1234	node = nodes[node_index]
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1235	node_keys = node.keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1236	parents_to_check = set()
	1237	for next_sub_key in sub_keys:
4593.4.5 by John Arbash Meinel Start adding some tests.	1238	if next_sub_key not in node_keys:
	1239	# This one is just not present in the index at all
	1240	missing_keys.add(next_sub_key)
	1241	else:
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1242	value, refs = node_keys[next_sub_key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1243	parent_keys = refs[ref_list_num]
	1244	parent_map[next_sub_key] = parent_keys
	1245	parents_to_check.update(parent_keys)
	1246	# Don't look for things we've already found
	1247	parents_to_check = parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1248	# this can be used to test the benefit of having the check loop
	1249	# inlined.
	1250	# parents_not_on_page.update(parents_to_check)
	1251	# continue
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1252	while parents_to_check:
	1253	next_parents_to_check = set()
	1254	for key in parents_to_check:
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1255	if key in node_keys:
	1256	value, refs = node_keys[key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1257	parent_keys = refs[ref_list_num]
	1258	parent_map[key] = parent_keys
	1259	next_parents_to_check.update(parent_keys)
	1260	else:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1261	# This parent either is genuinely missing, or should be
	1262	# found on another page. Perf test whether it is better
	1263	# to check if this node should fit on this page or not.
	1264	# in the 'everything-in-one-pack' scenario, this not
	1265	# doing the check is 237ms vs 243ms.
	1266	# So slightly better, but I assume the standard 'lots
	1267	# of packs' is going to show a reasonable improvement
	1268	# from the check, because it avoids 'going around
	1269	# again' for everything that is in another index
4593.4.5 by John Arbash Meinel Start adding some tests.	1270	# parents_not_on_page.add(key)
	1271	# Missing for some reason
	1272	if key < node.min_key:
	1273	# in the case of bzr.dev, 3.4k/5.3k misses are
	1274	# 'earlier' misses (65%)
	1275	parents_not_on_page.add(key)
	1276	elif key > node.max_key:
	1277	# This parent key would be present on a different
	1278	# LeafNode
	1279	parents_not_on_page.add(key)
	1280	else:
	1281	# assert key != node.min_key and key != node.max_key
	1282	# If it was going to be present, it would be on
	1283	# this page, so mark it missing.
	1284	missing_keys.add(key)
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1285	parents_to_check = next_parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1286	# Might want to do another .difference() from missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1287	# parents_not_on_page could have been found on a different page, or be
	1288	# known to be missing. So cull out everything that has already been
	1289	# found.
4593.4.5 by John Arbash Meinel Start adding some tests.	1290	search_keys = parents_not_on_page.difference(
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1291	parent_map).difference(missing_keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1292	return search_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1293
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1294	def iter_entries_prefix(self, keys):
	1295	"""Iterate over keys within the index using prefix matching.
	1296
	1297	Prefix matching is applied within the tuple of a key, not to within
	1298	the bytestring of each key element. e.g. if you have the keys ('foo',
	1299	'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
	1300	only the former key is returned.
	1301
	1302	WARNING: Note that this method currently causes a full index parse
	1303	unconditionally (which is reasonably appropriate as it is a means for
	1304	thunking many small indices into one larger one and still supplies
	1305	iter_all_entries at the thunk layer).
	1306
	1307	:param keys: An iterable providing the key prefixes to be retrieved.
	1308	Each key prefix takes the form of a tuple the length of a key, but
	1309	with the last N elements 'None' rather than a regular bytestring.
	1310	The first element cannot be 'None'.
	1311	:return: An iterable as per iter_all_entries, but restricted to the
	1312	keys with a matching prefix to those supplied. No additional keys
	1313	will be returned, and every match that is in the index will be
	1314	returned.
	1315	"""
	1316	keys = sorted(set(keys))
	1317	if not keys:
	1318	return
	1319	# Load if needed to check key lengths
	1320	if self._key_count is None:
	1321	self._get_root_node()
	1322	# TODO: only access nodes that can satisfy the prefixes we are looking
	1323	# for. For now, to meet API usage (as this function is not used by
	1324	# current bzrlib) just suck the entire index and iterate in memory.
	1325	nodes = {}
	1326	if self.node_ref_lists:
	1327	if self._key_length == 1:
	1328	for _1, key, value, refs in self.iter_all_entries():
	1329	nodes[key] = value, refs
	1330	else:
	1331	nodes_by_key = {}
	1332	for _1, key, value, refs in self.iter_all_entries():
	1333	key_value = key, value, refs
	1334	# For a key of (foo, bar, baz) create
	1335	# _nodes_by_key[foo][bar][baz] = key_value
	1336	key_dict = nodes_by_key
	1337	for subkey in key[:-1]:
	1338	key_dict = key_dict.setdefault(subkey, {})
	1339	key_dict[key[-1]] = key_value
	1340	else:
	1341	if self._key_length == 1:
	1342	for _1, key, value in self.iter_all_entries():
	1343	nodes[key] = value
	1344	else:
	1345	nodes_by_key = {}
	1346	for _1, key, value in self.iter_all_entries():
	1347	key_value = key, value
	1348	# For a key of (foo, bar, baz) create
	1349	# _nodes_by_key[foo][bar][baz] = key_value
	1350	key_dict = nodes_by_key
	1351	for subkey in key[:-1]:
	1352	key_dict = key_dict.setdefault(subkey, {})
	1353	key_dict[key[-1]] = key_value
	1354	if self._key_length == 1:
	1355	for key in keys:
	1356	# sanity check
	1357	if key[0] is None:
1358	raise errors.BadIndexKey(key)
1359	if len(key) != self._key_length:
1360	raise errors.BadIndexKey(key)
1361	try:
1362	if self.node_ref_lists:
1363	value, node_refs = nodes[key]
1364	yield self, key, value, node_refs
1365	else:
1366	yield self, key, nodes[key]
1367	except KeyError:
1368	pass
1369	return
1370	for key in keys:
1371	# sanity check
1372	if key[0] is None:
1373	raise errors.BadIndexKey(key)
1374	if len(key) != self._key_length:
1375	raise errors.BadIndexKey(key)
1376	# find what it refers to:
1377	key_dict = nodes_by_key
1378	elements = list(key)
1379	# find the subdict whose contents should be returned.
1380	try:
1381	while len(elements) and elements[0] is not None:
1382	key_dict = key_dict[elements[0]]
1383	elements.pop(0)
1384	except KeyError:
1385	# a non-existant lookup.
1386	continue
1387	if len(elements):
1388	dicts = [key_dict]
1389	while dicts:
1390	key_dict = dicts.pop(-1)
1391	# can't be empty or would not exist
1392	item, value = key_dict.iteritems().next()
1393	if type(value) == dict:
1394	# push keys
1395	dicts.extend(key_dict.itervalues())
1396	else:
1397	# yield keys
1398	for value in key_dict.itervalues():
1399	# each value is the key:value:node refs tuple
1400	# ready to yield.
1401	yield (self, ) + value
1402	else:
1403	# the last thing looked up was a terminal element
1404	yield (self, ) + key_dict
1405
1406	def key_count(self):
1407	"""Return an estimate of the number of keys in this index.
1408
1409	For BTreeGraphIndex the estimate is exact as it is contained in the
1410	header.
1411	"""
1412	if self._key_count is None:
1413	self._get_root_node()
1414	return self._key_count
1415
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1416	def _compute_row_offsets(self):
	1417	"""Fill out the _row_offsets attribute based on _row_lengths."""
	1418	offsets = []
	1419	row_offset = 0
	1420	for row in self._row_lengths:
	1421	offsets.append(row_offset)
	1422	row_offset += row
	1423	offsets.append(row_offset)
	1424	self._row_offsets = offsets
	1425
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1426	def _parse_header_from_bytes(self, bytes):
	1427	"""Parse the header from a region of bytes.
	1428
	1429	:param bytes: The data to parse.
	1430	:return: An offset, data tuple such as readv yields, for the unparsed
	1431	data. (which may be of length 0).
	1432	"""
	1433	signature = bytes[0:len(self._signature())]
	1434	if not signature == self._signature():
	1435	raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
	1436	lines = bytes[len(self._signature()):].splitlines()
	1437	options_line = lines[0]
	1438	if not options_line.startswith(_OPTION_NODE_REFS):
	1439	raise errors.BadIndexOptions(self)
	1440	try:
	1441	self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
	1442	except ValueError:
	1443	raise errors.BadIndexOptions(self)
	1444	options_line = lines[1]
	1445	if not options_line.startswith(_OPTION_KEY_ELEMENTS):
	1446	raise errors.BadIndexOptions(self)
	1447	try:
	1448	self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
	1449	except ValueError:
	1450	raise errors.BadIndexOptions(self)
	1451	options_line = lines[2]
	1452	if not options_line.startswith(_OPTION_LEN):
	1453	raise errors.BadIndexOptions(self)
	1454	try:
	1455	self._key_count = int(options_line[len(_OPTION_LEN):])
	1456	except ValueError:
	1457	raise errors.BadIndexOptions(self)
	1458	options_line = lines[3]
	1459	if not options_line.startswith(_OPTION_ROW_LENGTHS):
	1460	raise errors.BadIndexOptions(self)
	1461	try:
	1462	self._row_lengths = map(int, [length for length in
	1463	options_line[len(_OPTION_ROW_LENGTHS):].split(',')
	1464	if len(length)])
	1465	except ValueError:
	1466	raise errors.BadIndexOptions(self)
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1467	self._compute_row_offsets()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1468
	1469	# calculate the bytes we have processed
	1470	header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
	1471	return header_end, bytes[header_end:]
	1472
	1473	def _read_nodes(self, nodes):
	1474	"""Read some nodes from disk into the LRU cache.
	1475
	1476	This performs a readv to get the node data into memory, and parses each
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1477	node, then yields it to the caller. The nodes are requested in the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1478	supplied order. If possible doing sort() on the list before requesting
	1479	a read may improve performance.
	1480
	1481	:param nodes: The nodes to read. 0 - first node, 1 - second node etc.
	1482	:return: None
	1483	"""
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1484	# may be the byte string of the whole file
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1485	bytes = None
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1486	# list of (offset, length) regions of the file that should, evenually
	1487	# be read in to data_ranges, either from 'bytes' or from the transport
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1488	ranges = []
	1489	for index in nodes:
	1490	offset = index * _PAGE_SIZE
	1491	size = _PAGE_SIZE
	1492	if index == 0:
	1493	# Root node - special case
	1494	if self._size:
	1495	size = min(_PAGE_SIZE, self._size)
	1496	else:
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1497	# The only case where we don't know the size, is for very
	1498	# small indexes. So we read the whole thing
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1499	bytes = self._transport.get_bytes(self._name)
	1500	self._size = len(bytes)
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1501	# the whole thing should be parsed out of 'bytes'
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1502	ranges.append((0, len(bytes)))
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1503	break
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1504	else:
3763.8.6 by John Arbash Meinel Fix the logic a bit, and add a bit more tweaking opportunities	1505	if offset > self._size:
	1506	raise AssertionError('tried to read past the end'
	1507	' of the file %s > %s'
	1508	% (offset, self._size))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1509	size = min(size, self._size - offset)
	1510	ranges.append((offset, size))
	1511	if not ranges:
	1512	return
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1513	elif bytes is not None:
	1514	# already have the whole file
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1515	data_ranges = [(start, bytes[start:start+_PAGE_SIZE])
	1516	for start in xrange(0, len(bytes), _PAGE_SIZE)]
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1517	elif self._file is None:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1518	data_ranges = self._transport.readv(self._name, ranges)
	1519	else:
	1520	data_ranges = []
	1521	for offset, size in ranges:
	1522	self._file.seek(offset)
	1523	data_ranges.append((offset, self._file.read(size)))
	1524	for offset, data in data_ranges:
	1525	if offset == 0:
	1526	# extract the header
	1527	offset, data = self._parse_header_from_bytes(data)
	1528	if len(data) == 0:
	1529	continue
	1530	bytes = zlib.decompress(data)
	1531	if bytes.startswith(_LEAF_FLAG):
	1532	node = _LeafNode(bytes, self._key_length, self.node_ref_lists)
	1533	elif bytes.startswith(_INTERNAL_FLAG):
	1534	node = _InternalNode(bytes)
	1535	else:
	1536	raise AssertionError("Unknown node type for %r" % bytes)
	1537	yield offset / _PAGE_SIZE, node
	1538
	1539	def _signature(self):
	1540	"""The file signature for this index type."""
	1541	return _BTSIGNATURE
	1542
	1543	def validate(self):
	1544	"""Validate that everything in the index can be accessed."""
	1545	# just read and parse every node.
	1546	self._get_root_node()
	1547	if len(self._row_lengths) > 1:
	1548	start_node = self._row_offsets[1]
	1549	else:
	1550	# We shouldn't be reading anything anyway
	1551	start_node = 1
	1552	node_end = self._row_offsets[-1]
	1553	for node in self._read_nodes(range(start_node, node_end)):
	1554	pass
	1555
	1556
	1557	try:
4459.2.1 by Vincent Ladeuil Use a consistent scheme for naming pyrex source files.	1558	from bzrlib import _btree_serializer_pyx as _btree_serializer
4574.3.6 by Martin Pool More warnings when failing to load extensions	1559	except ImportError, e:
4574.3.8 by Martin Pool Only mutter extension load errors when they occur, and record for later	1560	osutils.failed_to_load_extension(e)
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	1561	from bzrlib import _btree_serializer_py as _btree_serializer