~bzr-pqm/bzr/bzr.dev : contents of bzrlib/btree

~bzr-pqm/bzr/bzr.dev : (revision 6169)

5752.3.8 by John Arbash Meinel Merge bzr.dev 5764 to resolve release-notes (aka NEWS) conflicts	1	# Copyright (C) 2008-2011 Canonical Ltd
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	2	#
	3	# This program is free software; you can redistribute it and/or modify
3641.3.29 by John Arbash Meinel Cleanup the copyright headers	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob update FSF mailing address	15	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	16	#
	17
	18	"""B+Tree indices"""
	19
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	20	import cStringIO
5753.2.2 by Jelmer Vernooij Remove some unnecessary imports, clean up lazy imports.	21
	22	from bzrlib.lazy_import import lazy_import
	23	lazy_import(globals(), """
5753.2.4 by Jelmer Vernooij Review feedback from John.	24	import bisect
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	25	import math
	26	import tempfile
	27	import zlib
5753.2.2 by Jelmer Vernooij Remove some unnecessary imports, clean up lazy imports.	28	""")
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	29
	30	from bzrlib import (
	31	chunk_writer,
	32	debug,
	33	errors,
4208.1.2 by John Arbash Meinel Switch to using a FIFOCache.	34	fifo_cache,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	35	index,
	36	lru_cache,
	37	osutils,
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	38	static_tuple,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	39	trace,
5273.1.7 by Vincent Ladeuil No more use of the get_transport imported symbol, all uses are through	40	transport,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	41	)
	42	from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
	43
	44
3641.3.3 by John Arbash Meinel Change the header to indicate these indexes are	45	_BTSIGNATURE = "B+Tree Graph Index 2\n"
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	46	_OPTION_ROW_LENGTHS = "row_lengths="
	47	_LEAF_FLAG = "type=leaf\n"
	48	_INTERNAL_FLAG = "type=internal\n"
	49	_INTERNAL_OFFSET = "offset="
	50
	51	_RESERVED_HEADER_BYTES = 120
	52	_PAGE_SIZE = 4096
	53
	54	# 4K per page: 4MB - 1000 entries
	55	_NODE_CACHE_SIZE = 1000
	56
	57
	58	class _BuilderRow(object):
	59	"""The stored state accumulated while writing out a row in the index.
	60
	61	:ivar spool: A temporary file used to accumulate nodes for this row
	62	in the tree.
	63	:ivar nodes: The count of nodes emitted so far.
	64	"""
	65
	66	def __init__(self):
	67	"""Create a _BuilderRow."""
	68	self.nodes = 0
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	69	self.spool = None# tempfile.TemporaryFile(prefix='bzr-index-row-')
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	70	self.writer = None
	71
	72	def finish_node(self, pad=True):
	73	byte_lines, _, padding = self.writer.finish()
	74	if self.nodes == 0:
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	75	self.spool = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	76	# padded note:
	77	self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	78	elif self.nodes == 1:
	79	# We got bigger than 1 node, switch to a temp file
	80	spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
	81	spool.write(self.spool.getvalue())
	82	self.spool = spool
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	83	skipped_bytes = 0
	84	if not pad and padding:
	85	del byte_lines[-1]
	86	skipped_bytes = padding
	87	self.spool.writelines(byte_lines)
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	88	remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
	89	if remainder != 0:
	90	raise AssertionError("incorrect node length: %d, %d"
	91	% (self.spool.tell(), remainder))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	92	self.nodes += 1
	93	self.writer = None
	94
	95
	96	class _InternalBuilderRow(_BuilderRow):
	97	"""The stored state accumulated while writing out internal rows."""
	98
	99	def finish_node(self, pad=True):
	100	if not pad:
	101	raise AssertionError("Must pad internal nodes only.")
	102	_BuilderRow.finish_node(self)
	103
	104
	105	class _LeafBuilderRow(_BuilderRow):
	106	"""The stored state accumulated while writing out a leaf rows."""
	107
	108
	109	class BTreeBuilder(index.GraphIndexBuilder):
	110	"""A Builder for B+Tree based Graph indices.
	111
	112	The resulting graph has the structure:
	113
	114	_SIGNATURE OPTIONS NODES
	115	_SIGNATURE := 'B+Tree Graph Index 1' NEWLINE
	116	OPTIONS := REF_LISTS KEY_ELEMENTS LENGTH
	117	REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
	118	KEY_ELEMENTS := 'key_elements=' DIGITS NEWLINE
	119	LENGTH := 'len=' DIGITS NEWLINE
	120	ROW_LENGTHS := 'row_lengths' DIGITS (COMMA DIGITS)*
	121	NODES := NODE_COMPRESSED*
	122	NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
	123	NODE_RAW := INTERNAL \| LEAF
	124	INTERNAL := INTERNAL_FLAG POINTERS
	125	LEAF := LEAF_FLAG ROWS
	126	KEY_ELEMENT := Not-whitespace-utf8
	127	KEY := KEY_ELEMENT (NULL KEY_ELEMENT)*
	128	ROWS := ROW*
	129	ROW := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
	130	ABSENT := 'a'
	131	REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
	132	REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
	133	REFERENCE := KEY
	134	VALUE := no-newline-no-null-bytes
	135	"""
	136
	137	def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
	138	"""See GraphIndexBuilder.__init__.
	139
	140	:param spill_at: Optional parameter controlling the maximum number
	141	of nodes that BTreeBuilder will hold in memory.
	142	"""
	143	index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
	144	key_elements=key_elements)
	145	self._spill_at = spill_at
	146	self._backing_indices = []
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	147	# A map of {key: (node_refs, value)}
	148	self._nodes = {}
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	149	# Indicate it hasn't been built yet
	150	self._nodes_by_key = None
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	151	self._optimize_for_size = False
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	152
	153	def add_node(self, key, value, references=()):
	154	"""Add a node to the index.
	155
	156	If adding the node causes the builder to reach its spill_at threshold,
	157	disk spilling will be triggered.
	158
	159	:param key: The key. keys are non-empty tuples containing
	160	as many whitespace-free utf8 bytestrings as the key length
	161	defined for this index.
	162	:param references: An iterable of iterables of keys. Each is a
	163	reference to another key.
	164	:param value: The value to associate with the key. It may be any
5891.1.3 by Andrew Bennetts Move docstring formatting fixes.	165	bytes as long as it does not contain \\0 or \\n.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	166	"""
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	167	# Ensure that 'key' is a StaticTuple
	168	key = static_tuple.StaticTuple.from_sequence(key).intern()
3644.2.9 by John Arbash Meinel Refactor some code.	169	# we don't care about absent_references
3644.2.9 by John Arbash Meinel Refactor some code.	170	node_refs, _ = self._check_key_ref_value(key, references, value)
3644.2.2 by John Arbash Meinel the new btree index doesn't have 'absent' keys in its _nodes	171	if key in self._nodes:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	172	raise errors.BadIndexDuplicateKey(key, self)
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	173	self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
3644.2.9 by John Arbash Meinel Refactor some code.	174	if self._nodes_by_key is not None and self._key_length > 1:
3644.2.9 by John Arbash Meinel Refactor some code.	175	self._update_nodes_by_key(key, value, node_refs)
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	176	if len(self._nodes) < self._spill_at:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	177	return
3644.2.9 by John Arbash Meinel Refactor some code.	178	self._spill_mem_keys_to_disk()
	179
	180	def _spill_mem_keys_to_disk(self):
	181	"""Write the in memory keys down to disk to cap memory consumption.
	182
	183	If we already have some keys written to disk, we will combine them so
	184	as to preserve the sorted order. The algorithm for combining uses
	185	powers of two. So on the first spill, write all mem nodes into a
	186	single index. On the second spill, combine the mem nodes with the nodes
	187	on disk to create a 2x sized disk index and get rid of the first index.
	188	On the third spill, create a single new disk index, which will contain
	189	the mem nodes, and preserve the existing 2x sized index. On the fourth,
	190	combine mem with the first and second indexes, creating a new one of
	191	size 4x. On the fifth create a single new one, etc.
	192	"""
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	193	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	194	(new_backing_file, size,
	195	backing_pos) = self._spill_mem_keys_and_combine()
	196	else:
	197	new_backing_file, size = self._spill_mem_keys_without_combining()
	198	# Note: The transport here isn't strictly needed, because we will use
	199	# direct access to the new_backing._file object
6083.1.1 by Jelmer Vernooij Use get_transport_from_{url,path} in more places.	200	new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
5273.1.7 by Vincent Ladeuil No more use of the get_transport imported symbol, all uses are through	201	'<temp>', size)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	202	# GC will clean up the file
	203	new_backing._file = new_backing_file
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	204	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	205	if len(self._backing_indices) == backing_pos:
	206	self._backing_indices.append(None)
	207	self._backing_indices[backing_pos] = new_backing
	208	for backing_pos in range(backing_pos):
	209	self._backing_indices[backing_pos] = None
	210	else:
	211	self._backing_indices.append(new_backing)
	212	self._nodes = {}
	213	self._nodes_by_key = None
	214
	215	def _spill_mem_keys_without_combining(self):
	216	return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
	217
	218	def _spill_mem_keys_and_combine(self):
4168.3.4 by John Arbash Meinel Restore the ability to spill, but prepare a flag to disable it.	219	iterators_to_combine = [self._iter_mem_nodes()]
	220	pos = -1
	221	for pos, backing in enumerate(self._backing_indices):
	222	if backing is None:
	223	pos -= 1
	224	break
	225	iterators_to_combine.append(backing.iter_all_entries())
	226	backing_pos = pos + 1
	227	new_backing_file, size = \
	228	self._write_nodes(self._iter_smallest(iterators_to_combine),
	229	allow_optimize=False)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	230	return new_backing_file, size, backing_pos
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	231
	232	def add_nodes(self, nodes):
	233	"""Add nodes to the index.
	234
	235	:param nodes: An iterable of (key, node_refs, value) entries to add.
	236	"""
	237	if self.reference_lists:
	238	for (key, value, node_refs) in nodes:
	239	self.add_node(key, value, node_refs)
	240	else:
	241	for (key, value) in nodes:
	242	self.add_node(key, value)
	243
	244	def _iter_mem_nodes(self):
	245	"""Iterate over the nodes held in memory."""
3644.2.8 by John Arbash Meinel Two quick tweaks.	246	nodes = self._nodes
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	247	if self.reference_lists:
3644.2.8 by John Arbash Meinel Two quick tweaks.	248	for key in sorted(nodes):
	249	references, value = nodes[key]
	250	yield self, key, value, references
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	251	else:
3644.2.8 by John Arbash Meinel Two quick tweaks.	252	for key in sorted(nodes):
	253	references, value = nodes[key]
	254	yield self, key, value
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	255
	256	def _iter_smallest(self, iterators_to_combine):
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	257	if len(iterators_to_combine) == 1:
	258	for value in iterators_to_combine[0]:
	259	yield value
	260	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	261	current_values = []
	262	for iterator in iterators_to_combine:
	263	try:
	264	current_values.append(iterator.next())
	265	except StopIteration:
	266	current_values.append(None)
	267	last = None
	268	while True:
	269	# Decorate candidates with the value to allow 2.4's min to be used.
	270	candidates = [(item[1][1], item) for item
	271	in enumerate(current_values) if item[1] is not None]
	272	if not len(candidates):
	273	return
	274	selected = min(candidates)
	275	# undecorate back to (pos, node)
	276	selected = selected[1]
	277	if last == selected[1][1]:
	278	raise errors.BadIndexDuplicateKey(last, self)
	279	last = selected[1][1]
	280	# Yield, with self as the index
	281	yield (self,) + selected[1][1:]
	282	pos = selected[0]
	283	try:
	284	current_values[pos] = iterators_to_combine[pos].next()
	285	except StopIteration:
	286	current_values[pos] = None
	287
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	288	def _add_key(self, string_key, line, rows, allow_optimize=True):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	289	"""Add a key to the current chunk.
	290
	291	:param string_key: The key to add.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	292	:param line: The fully serialised key and value.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	293	:param allow_optimize: If set to False, prevent setting the optimize
	294	flag when writing out. This is used by the _spill_mem_keys_to_disk
	295	functionality.
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	296	"""
	297	if rows[-1].writer is None:
	298	# opening a new leaf chunk;
	299	for pos, internal_row in enumerate(rows[:-1]):
	300	# flesh out any internal nodes that are needed to
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	301	# preserve the height of the tree
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	302	if internal_row.writer is None:
	303	length = _PAGE_SIZE
	304	if internal_row.nodes == 0:
	305	length -= _RESERVED_HEADER_BYTES # padded
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	306	if allow_optimize:
	307	optimize_for_size = self._optimize_for_size
	308	else:
	309	optimize_for_size = False
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	310	internal_row.writer = chunk_writer.ChunkWriter(length, 0,
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	311	optimize_for_size=optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	312	internal_row.writer.write(_INTERNAL_FLAG)
	313	internal_row.writer.write(_INTERNAL_OFFSET +
	314	str(rows[pos + 1].nodes) + "\n")
	315	# add a new leaf
	316	length = _PAGE_SIZE
	317	if rows[-1].nodes == 0:
	318	length -= _RESERVED_HEADER_BYTES # padded
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	319	rows[-1].writer = chunk_writer.ChunkWriter(length,
	320	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	321	rows[-1].writer.write(_LEAF_FLAG)
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	322	if rows[-1].writer.write(line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	323	# this key did not fit in the node:
	324	rows[-1].finish_node()
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	325	key_line = string_key + "\n"
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	326	new_row = True
	327	for row in reversed(rows[:-1]):
	328	# Mark the start of the next node in the node above. If it
4031.3.1 by Frank Aspell Fixing various typos	329	# doesn't fit then propagate upwards until we find one that
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	330	# it does fit into.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	331	if row.writer.write(key_line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	332	row.finish_node()
	333	else:
	334	# We've found a node that can handle the pointer.
	335	new_row = False
	336	break
	337	# If we reached the current root without being able to mark the
	338	# division point, then we need a new root:
	339	if new_row:
	340	# We need a new row
	341	if 'index' in debug.debug_flags:
	342	trace.mutter('Inserting new global row.')
	343	new_row = _InternalBuilderRow()
	344	reserved_bytes = 0
	345	rows.insert(0, new_row)
	346	# This will be padded, hence the -100
	347	new_row.writer = chunk_writer.ChunkWriter(
	348	_PAGE_SIZE - _RESERVED_HEADER_BYTES,
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	349	reserved_bytes,
	350	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	351	new_row.writer.write(_INTERNAL_FLAG)
	352	new_row.writer.write(_INTERNAL_OFFSET +
	353	str(rows[1].nodes - 1) + "\n")
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	354	new_row.writer.write(key_line)
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	355	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	356
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	357	def _write_nodes(self, node_iterator, allow_optimize=True):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	358	"""Write node_iterator out as a B+Tree.
	359
	360	:param node_iterator: An iterator of sorted nodes. Each node should
	361	match the output given by iter_all_entries.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	362	:param allow_optimize: If set to False, prevent setting the optimize
	363	flag when writing out. This is used by the _spill_mem_keys_to_disk
	364	functionality.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	365	:return: A file handle for a temporary file containing a B+Tree for
	366	the nodes.
	367	"""
	368	# The index rows - rows[0] is the root, rows[1] is the layer under it
	369	# etc.
	370	rows = []
	371	# forward sorted by key. In future we may consider topological sorting,
	372	# at the cost of table scans for direct lookup, or a second index for
	373	# direct lookup
	374	key_count = 0
	375	# A stack with the number of nodes of each size. 0 is the root node
	376	# and must always be 1 (if there are any nodes in the tree).
	377	self.row_lengths = []
	378	# Loop over all nodes adding them to the bottom row
	379	# (rows[-1]). When we finish a chunk in a row,
4031.3.1 by Frank Aspell Fixing various typos	380	# propagate the key that didn't fit (comes after the chunk) to the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	381	# row above, transitively.
	382	for node in node_iterator:
	383	if key_count == 0:
	384	# First key triggers the first row
	385	rows.append(_LeafBuilderRow())
	386	key_count += 1
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	387	string_key, line = _btree_serializer._flatten_node(node,
	388	self.reference_lists)
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	389	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	390	for row in reversed(rows):
	391	pad = (type(row) != _LeafBuilderRow)
	392	row.finish_node(pad=pad)
	393	lines = [_BTSIGNATURE]
	394	lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
	395	lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
	396	lines.append(_OPTION_LEN + str(key_count) + '\n')
	397	row_lengths = [row.nodes for row in rows]
	398	lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	399	if row_lengths and row_lengths[-1] > 1:
	400	result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
	401	else:
	402	result = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	403	result.writelines(lines)
	404	position = sum(map(len, lines))
	405	root_row = True
	406	if position > _RESERVED_HEADER_BYTES:
	407	raise AssertionError("Could not fit the header in the"
	408	" reserved space: %d > %d"
	409	% (position, _RESERVED_HEADER_BYTES))
	410	# write the rows out:
	411	for row in rows:
	412	reserved = _RESERVED_HEADER_BYTES # reserved space for first node
	413	row.spool.flush()
	414	row.spool.seek(0)
	415	# copy nodes to the finalised file.
	416	# Special case the first node as it may be prefixed
	417	node = row.spool.read(_PAGE_SIZE)
	418	result.write(node[reserved:])
4771.3.1 by John Arbash Meinel We don't have to pad 'short' records.	419	if len(node) == _PAGE_SIZE:
	420	result.write("\x00" * (reserved - position))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	421	position = 0 # Only the root row actually has an offset
	422	copied_len = osutils.pumpfile(row.spool, result)
	423	if copied_len != (row.nodes - 1) * _PAGE_SIZE:
	424	if type(row) != _LeafBuilderRow:
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	425	raise AssertionError("Incorrect amount of data copied"
	426	" expected: %d, got: %d"
	427	% ((row.nodes - 1) * _PAGE_SIZE,
	428	copied_len))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	429	result.flush()
	430	size = result.tell()
	431	result.seek(0)
	432	return result, size
	433
	434	def finish(self):
	435	"""Finalise the index.
	436
	437	:return: A file handle for a temporary file containing the nodes added
	438	to the index.
	439	"""
	440	return self._write_nodes(self.iter_all_entries())[0]
	441
	442	def iter_all_entries(self):
	443	"""Iterate over all keys within the index
	444
4343.2.2 by John Arbash Meinel Fix an important doc bug about the api of iter_all_entries()	445	:return: An iterable of (index, key, value, reference_lists). There is
	446	no defined order for the result iteration - it will be in the most
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	447	efficient order for the index (in this case dictionary hash order).
	448	"""
	449	if 'evil' in debug.debug_flags:
	450	trace.mutter_callsite(3,
	451	"iter_all_entries scales with size of history.")
	452	# Doing serial rather than ordered would be faster; but this shouldn't
	453	# be getting called routinely anyway.
3644.2.8 by John Arbash Meinel Two quick tweaks.	454	iterators = [self._iter_mem_nodes()]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	455	for backing in self._backing_indices:
	456	if backing is not None:
	457	iterators.append(backing.iter_all_entries())
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	458	if len(iterators) == 1:
	459	return iterators[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	460	return self._iter_smallest(iterators)
	461
	462	def iter_entries(self, keys):
	463	"""Iterate over keys within the index.
	464
	465	:param keys: An iterable providing the keys to be retrieved.
	466	:return: An iterable of (index, key, value, reference_lists). There is no
	467	defined order for the result iteration - it will be in the most
	468	efficient order for the index (keys iteration order in this case).
	469	"""
	470	keys = set(keys)
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	471	# Note: We don't use keys.intersection() here. If you read the C api,
	472	# set.intersection(other) special cases when other is a set and
	473	# will iterate the smaller of the two and lookup in the other.
	474	# It does not do this for any other type (even dict, unlike
	475	# some other set functions.) Since we expect keys is generally <<
	476	# self._nodes, it is faster to iterate over it in a list
	477	# comprehension
	478	nodes = self._nodes
	479	local_keys = [key for key in keys if key in nodes]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	480	if self.reference_lists:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	481	for key in local_keys:
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	482	node = nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	483	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	484	else:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	485	for key in local_keys:
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	486	node = nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	487	yield self, key, node[1]
3847.2.1 by John Arbash Meinel Shortcut BTreeBuilder.iter_entries when there are no backing indices.	488	# Find things that are in backing indices that have not been handled
	489	# yet.
3847.2.3 by John Arbash Meinel Bring back the shortcut	490	if not self._backing_indices:
3847.2.3 by John Arbash Meinel Bring back the shortcut	491	return # We won't find anything there either
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	492	# Remove all of the keys that we found locally
	493	keys.difference_update(local_keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	494	for backing in self._backing_indices:
	495	if backing is None:
	496	continue
	497	if not keys:
	498	return
	499	for node in backing.iter_entries(keys):
	500	keys.remove(node[1])
	501	yield (self,) + node[1:]
	502
	503	def iter_entries_prefix(self, keys):
	504	"""Iterate over keys within the index using prefix matching.
	505
	506	Prefix matching is applied within the tuple of a key, not to within
	507	the bytestring of each key element. e.g. if you have the keys ('foo',
	508	'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
	509	only the former key is returned.
	510
	511	:param keys: An iterable providing the key prefixes to be retrieved.
	512	Each key prefix takes the form of a tuple the length of a key, but
	513	with the last N elements 'None' rather than a regular bytestring.
	514	The first element cannot be 'None'.
	515	:return: An iterable as per iter_all_entries, but restricted to the
	516	keys with a matching prefix to those supplied. No additional keys
	517	will be returned, and every match that is in the index will be
	518	returned.
	519	"""
	520	# XXX: To much duplication with the GraphIndex class; consider finding
	521	# a good place to pull out the actual common logic.
	522	keys = set(keys)
	523	if not keys:
	524	return
	525	for backing in self._backing_indices:
	526	if backing is None:
	527	continue
	528	for node in backing.iter_entries_prefix(keys):
	529	yield (self,) + node[1:]
	530	if self._key_length == 1:
	531	for key in keys:
	532	# sanity check
	533	if key[0] is None:
	534	raise errors.BadIndexKey(key)
	535	if len(key) != self._key_length:
	536	raise errors.BadIndexKey(key)
	537	try:
	538	node = self._nodes[key]
	539	except KeyError:
	540	continue
	541	if self.reference_lists:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	542	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	543	else:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	544	yield self, key, node[1]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	545	return
	546	for key in keys:
	547	# sanity check
	548	if key[0] is None:
	549	raise errors.BadIndexKey(key)
	550	if len(key) != self._key_length:
	551	raise errors.BadIndexKey(key)
	552	# find what it refers to:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	553	key_dict = self._get_nodes_by_key()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	554	elements = list(key)
	555	# find the subdict to return
	556	try:
	557	while len(elements) and elements[0] is not None:
	558	key_dict = key_dict[elements[0]]
	559	elements.pop(0)
	560	except KeyError:
	561	# a non-existant lookup.
	562	continue
	563	if len(elements):
	564	dicts = [key_dict]
	565	while dicts:
	566	key_dict = dicts.pop(-1)
	567	# can't be empty or would not exist
	568	item, value = key_dict.iteritems().next()
	569	if type(value) == dict:
	570	# push keys
	571	dicts.extend(key_dict.itervalues())
	572	else:
	573	# yield keys
	574	for value in key_dict.itervalues():
5088.1.1 by Jelmer Vernooij Force value to a tuple before concatenating with tuple that contains variables	575	yield (self, ) + tuple(value)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	576	else:
	577	yield (self, ) + key_dict
	578
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	579	def _get_nodes_by_key(self):
	580	if self._nodes_by_key is None:
	581	nodes_by_key = {}
	582	if self.reference_lists:
	583	for key, (references, value) in self._nodes.iteritems():
	584	key_dict = nodes_by_key
	585	for subkey in key[:-1]:
	586	key_dict = key_dict.setdefault(subkey, {})
	587	key_dict[key[-1]] = key, value, references
	588	else:
	589	for key, (references, value) in self._nodes.iteritems():
	590	key_dict = nodes_by_key
	591	for subkey in key[:-1]:
	592	key_dict = key_dict.setdefault(subkey, {})
	593	key_dict[key[-1]] = key, value
	594	self._nodes_by_key = nodes_by_key
	595	return self._nodes_by_key
	596
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	597	def key_count(self):
	598	"""Return an estimate of the number of keys in this index.
	599
	600	For InMemoryGraphIndex the estimate is exact.
	601	"""
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	602	return len(self._nodes) + sum(backing.key_count() for backing in
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	603	self._backing_indices if backing is not None)
	604
	605	def validate(self):
	606	"""In memory index's have no known corruption at the moment."""
	607
	608
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	609	class _LeafNode(dict):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	610	"""A leaf node for a serialised B+Tree index."""
	611
5365.5.23 by John Arbash Meinel A __sizeof__ check that ensure we are getting what we are looking for.	612	__slots__ = ('min_key', 'max_key', '_keys')
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	613
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	614	def __init__(self, bytes, key_length, ref_list_length):
	615	"""Parse bytes to create a leaf node object."""
	616	# splitlines mangles the \r delimiters.. don't use it.
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	617	key_list = _btree_serializer._parse_leaf_lines(bytes,
	618	key_length, ref_list_length)
	619	if key_list:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	620	self.min_key = key_list[0][0]
	621	self.max_key = key_list[-1][0]
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	622	else:
	623	self.min_key = self.max_key = None
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	624	super(_LeafNode, self).__init__(key_list)
5365.5.23 by John Arbash Meinel A __sizeof__ check that ensure we are getting what we are looking for.	625	self._keys = dict(self)
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	626
	627	def all_items(self):
	628	"""Return a sorted list of (key, (value, refs)) items"""
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	629	items = self.items()
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	630	items.sort()
	631	return items
	632
	633	def all_keys(self):
	634	"""Return a sorted list of all keys."""
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	635	keys = self.keys()
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	636	keys.sort()
	637	return keys
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	638
	639
	640	class _InternalNode(object):
	641	"""An internal node for a serialised B+Tree index."""
	642
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	643	__slots__ = ('keys', 'offset')
	644
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	645	def __init__(self, bytes):
	646	"""Parse bytes to create an internal node object."""
	647	# splitlines mangles the \r delimiters.. don't use it.
	648	self.keys = self._parse_lines(bytes.split('\n'))
	649
	650	def _parse_lines(self, lines):
	651	nodes = []
	652	self.offset = int(lines[1][7:])
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	653	as_st = static_tuple.StaticTuple.from_sequence
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	654	for line in lines[2:]:
	655	if line == '':
	656	break
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	657	nodes.append(as_st(map(intern, line.split('\0'))).intern())
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	658	return nodes
	659
	660
	661	class BTreeGraphIndex(object):
	662	"""Access to nodes via the standard GraphIndex interface for B+Tree's.
	663
	664	Individual nodes are held in a LRU cache. This holds the root node in
	665	memory except when very large walks are done.
	666	"""
	667
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	668	def __init__(self, transport, name, size, unlimited_cache=False,
	669	offset=0):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	670	"""Create a B+Tree index object on the index name.
	671
	672	:param transport: The transport to read data for the index from.
	673	:param name: The file name of the index on transport.
	674	:param size: Optional size of the index in bytes. This allows
	675	compatibility with the GraphIndex API, as well as ensuring that
	676	the initial read (to read the root node header) can be done
	677	without over-reading even on empty indices, and on small indices
	678	allows single-IO to read the entire index.
4634.71.1 by John Arbash Meinel Work around bug #402623 by allowing BTreeGraphIndex(...,unlimited_cache=True).	679	:param unlimited_cache: If set to True, then instead of using an
	680	LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
	681	cache all leaf nodes.
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	682	:param offset: The start of the btree index data isn't byte 0 of the
	683	file. Instead it starts at some point later.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	684	"""
	685	self._transport = transport
	686	self._name = name
	687	self._size = size
	688	self._file = None
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	689	self._recommended_pages = self._compute_recommended_pages()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	690	self._root_node = None
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	691	self._base_offset = offset
5365.5.18 by John Arbash Meinel Expose the new leaf node factory across the stack.	692	self._leaf_factory = _LeafNode
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	693	# Default max size is 100,000 leave values
	694	self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
4634.71.1 by John Arbash Meinel Work around bug #402623 by allowing BTreeGraphIndex(...,unlimited_cache=True).	695	if unlimited_cache:
	696	self._leaf_node_cache = {}
	697	self._internal_node_cache = {}
	698	else:
	699	self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
	700	# We use a FIFO here just to prevent possible blowout. However, a
	701	# 300k record btree has only 3k leaf nodes, and only 20 internal
	702	# nodes. A value of 100 scales to ~100100100 = 1M records.
	703	self._internal_node_cache = fifo_cache.FIFOCache(100)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	704	self._key_count = None
	705	self._row_lengths = None
	706	self._row_offsets = None # Start of each row, [-1] is the end
	707
	708	def __eq__(self, other):
	709	"""Equal when self and other were created with the same parameters."""
	710	return (
	711	type(self) == type(other) and
	712	self._transport == other._transport and
	713	self._name == other._name and
	714	self._size == other._size)
	715
	716	def __ne__(self, other):
	717	return not self.__eq__(other)
	718
3763.8.12 by John Arbash Meinel Code cleanup.	719	def _get_and_cache_nodes(self, nodes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	720	"""Read nodes and cache them in the lru.
	721
	722	The nodes list supplied is sorted and then read from disk, each node
	723	being inserted it into the _node_cache.
	724
	725	Note: Asking for more nodes than the _node_cache can contain will
	726	result in some of the results being immediately discarded, to prevent
	727	this an assertion is raised if more nodes are asked for than are
	728	cachable.
	729
	730	:return: A dict of {node_pos: node}
	731	"""
	732	found = {}
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	733	start_of_leaves = None
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	734	for node_pos, node in self._read_nodes(sorted(nodes)):
	735	if node_pos == 0: # Special case
	736	self._root_node = node
	737	else:
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	738	if start_of_leaves is None:
	739	start_of_leaves = self._row_offsets[-2]
	740	if node_pos < start_of_leaves:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	741	self._internal_node_cache[node_pos] = node
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	742	else:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	743	self._leaf_node_cache[node_pos] = node
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	744	found[node_pos] = node
	745	return found
	746
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	747	def _compute_recommended_pages(self):
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	748	"""Convert transport's recommended_page_size into btree pages.
	749
	750	recommended_page_size is in bytes, we want to know how many _PAGE_SIZE
	751	pages fit in that length.
	752	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	753	recommended_read = self._transport.recommended_page_size()
	754	recommended_pages = int(math.ceil(recommended_read /
	755	float(_PAGE_SIZE)))
	756	return recommended_pages
	757
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	758	def _compute_total_pages_in_index(self):
	759	"""How many pages are in the index.
	760
	761	If we have read the header we will use the value stored there.
	762	Otherwise it will be computed based on the length of the index.
	763	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	764	if self._size is None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	765	raise AssertionError('_compute_total_pages_in_index should not be'
	766	' called when self._size is None')
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	767	if self._root_node is not None:
	768	# This is the number of pages as defined by the header
	769	return self._row_offsets[-1]
	770	# This is the number of pages as defined by the size of the index. They
	771	# should be indentical.
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	772	total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
	773	return total_pages
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	774
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	775	def _expand_offsets(self, offsets):
	776	"""Find extra pages to download.
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	777
	778	The idea is that we always want to make big-enough requests (like 64kB
	779	for http), so that we don't waste round trips. So given the entries
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	780	that we already have cached and the new pages being downloaded figure
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	781	out what other pages we might want to read.
	782
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	783	See also doc/developers/btree_index_prefetch.txt for more details.
	784
	785	:param offsets: The offsets to be read
	786	:return: A list of offsets to download
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	787	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	788	if 'index' in debug.debug_flags:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	789	trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	790
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	791	if len(offsets) >= self._recommended_pages:
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	792	# Don't add more, we are already requesting more than enough
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	793	if 'index' in debug.debug_flags:
	794	trace.mutter(' not expanding large request (%s >= %s)',
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	795	len(offsets), self._recommended_pages)
	796	return offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	797	if self._size is None:
	798	# Don't try anything, because we don't know where the file ends
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	799	if 'index' in debug.debug_flags:
	800	trace.mutter(' not expanding without knowing index size')
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	801	return offsets
	802	total_pages = self._compute_total_pages_in_index()
	803	cached_offsets = self._get_offsets_to_cached_pages()
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	804	# If reading recommended_pages would read the rest of the index, just
	805	# do so.
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	806	if total_pages - len(cached_offsets) <= self._recommended_pages:
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	807	# Read whatever is left
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	808	if cached_offsets:
	809	expanded = [x for x in xrange(total_pages)
	810	if x not in cached_offsets]
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	811	else:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	812	expanded = range(total_pages)
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	813	if 'index' in debug.debug_flags:
	814	trace.mutter(' reading all unread pages: %s', expanded)
	815	return expanded
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	816
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	817	if self._root_node is None:
	818	# ATM on the first read of the root node of a large index, we don't
	819	# bother pre-reading any other pages. This is because the
	820	# likelyhood of actually reading interesting pages is very low.
	821	# See doc/developers/btree_index_prefetch.txt for a discussion, and
	822	# a possible implementation when we are guessing that the second
	823	# layer index is small
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	824	final_offsets = offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	825	else:
3763.8.14 by John Arbash Meinel Add in a shortcut when we haven't cached much yet.	826	tree_depth = len(self._row_lengths)
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	827	if len(cached_offsets) < tree_depth and len(offsets) == 1:
3763.8.14 by John Arbash Meinel Add in a shortcut when we haven't cached much yet.	828	# We haven't read enough to justify expansion
	829	# If we are only going to read the root node, and 1 leaf node,
	830	# then it isn't worth expanding our request. Once we've read at
	831	# least 2 nodes, then we are probably doing a search, and we
	832	# start expanding our requests.
	833	if 'index' in debug.debug_flags:
	834	trace.mutter(' not expanding on first reads')
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	835	return offsets
	836	final_offsets = self._expand_to_neighbors(offsets, cached_offsets,
	837	total_pages)
	838
	839	final_offsets = sorted(final_offsets)
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	840	if 'index' in debug.debug_flags:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	841	trace.mutter('expanded: %s', final_offsets)
	842	return final_offsets
	843
	844	def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
	845	"""Expand requests to neighbors until we have enough pages.
	846
	847	This is called from _expand_offsets after policy has determined that we
	848	want to expand.
	849	We only want to expand requests within a given layer. We cheat a little
	850	bit and assume all requests will be in the same layer. This is true
	851	given the current design, but if it changes this algorithm may perform
	852	oddly.
	853
	854	:param offsets: requested offsets
	855	:param cached_offsets: offsets for pages we currently have cached
	856	:return: A set() of offsets after expansion
	857	"""
	858	final_offsets = set(offsets)
	859	first = end = None
	860	new_tips = set(final_offsets)
	861	while len(final_offsets) < self._recommended_pages and new_tips:
	862	next_tips = set()
	863	for pos in new_tips:
	864	if first is None:
	865	first, end = self._find_layer_first_and_end(pos)
	866	previous = pos - 1
	867	if (previous > 0
	868	and previous not in cached_offsets
	869	and previous not in final_offsets
	870	and previous >= first):
	871	next_tips.add(previous)
	872	after = pos + 1
	873	if (after < total_pages
	874	and after not in cached_offsets
	875	and after not in final_offsets
	876	and after < end):
	877	next_tips.add(after)
	878	# This would keep us from going bigger than
	879	# recommended_pages by only expanding the first offsets.
	880	# However, if we are making a 'wide' request, it is
	881	# reasonable to expand all points equally.
	882	# if len(final_offsets) > recommended_pages:
	883	# break
	884	final_offsets.update(next_tips)
	885	new_tips = next_tips
	886	return final_offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	887
4744.2.6 by John Arbash Meinel Start exposing an GraphIndex.clear_cache() member.	888	def clear_cache(self):
	889	"""Clear out any cached/memoized values.
	890
	891	This can be called at any time, but generally it is used when we have
	892	extracted some information, but don't expect to be requesting any more
	893	from this index.
	894	"""
	895	# Note that we don't touch self._root_node or self._internal_node_cache
	896	# We don't expect either of those to be big, and it can save
	897	# round-trips in the future. We may re-evaluate this if InternalNode
	898	# memory starts to be an issue.
	899	self._leaf_node_cache.clear()
	900
4011.5.3 by Andrew Bennetts Implement and test external_references on GraphIndex and BTreeGraphIndex.	901	def external_references(self, ref_list_num):
	902	if self._root_node is None:
	903	self._get_root_node()
	904	if ref_list_num + 1 > self.node_ref_lists:
	905	raise ValueError('No ref list %d, index has %d ref lists'
	906	% (ref_list_num, self.node_ref_lists))
	907	keys = set()
	908	refs = set()
	909	for node in self.iter_all_entries():
	910	keys.add(node[1])
	911	refs.update(node[3][ref_list_num])
	912	return refs - keys
	913
3763.8.12 by John Arbash Meinel Code cleanup.	914	def _find_layer_first_and_end(self, offset):
	915	"""Find the start/stop nodes for the layer corresponding to offset.
	916
	917	:return: (first, end)
	918	first is the first node in this layer
	919	end is the first node of the next layer
	920	"""
	921	first = end = 0
	922	for roffset in self._row_offsets:
	923	first = end
	924	end = roffset
	925	if offset < roffset:
	926	break
	927	return first, end
	928
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	929	def _get_offsets_to_cached_pages(self):
3763.8.12 by John Arbash Meinel Code cleanup.	930	"""Determine what nodes we already have cached."""
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	931	cached_offsets = set(self._internal_node_cache.keys())
	932	cached_offsets.update(self._leaf_node_cache.keys())
3763.8.12 by John Arbash Meinel Code cleanup.	933	if self._root_node is not None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	934	cached_offsets.add(0)
	935	return cached_offsets
3763.8.12 by John Arbash Meinel Code cleanup.	936
	937	def _get_root_node(self):
	938	if self._root_node is None:
	939	# We may not have a root node yet
	940	self._get_internal_nodes([0])
	941	return self._root_node
	942
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	943	def _get_nodes(self, cache, node_indexes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	944	found = {}
	945	needed = []
	946	for idx in node_indexes:
	947	if idx == 0 and self._root_node is not None:
	948	found[0] = self._root_node
	949	continue
	950	try:
	951	found[idx] = cache[idx]
	952	except KeyError:
	953	needed.append(idx)
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	954	if not needed:
	955	return found
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	956	needed = self._expand_offsets(needed)
3763.8.12 by John Arbash Meinel Code cleanup.	957	found.update(self._get_and_cache_nodes(needed))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	958	return found
	959
	960	def _get_internal_nodes(self, node_indexes):
	961	"""Get a node, from cache or disk.
	962
	963	After getting it, the node will be cached.
	964	"""
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	965	return self._get_nodes(self._internal_node_cache, node_indexes)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	966
3805.4.6 by John Arbash Meinel refactor for clarity.	967	def _cache_leaf_values(self, nodes):
3805.4.6 by John Arbash Meinel refactor for clarity.	968	"""Cache directly from key => value, skipping the btree."""
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	969	if self._leaf_value_cache is not None:
3805.4.6 by John Arbash Meinel refactor for clarity.	970	for node in nodes.itervalues():
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	971	for key, value in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	972	if key in self._leaf_value_cache:
	973	# Don't add the rest of the keys, we've seen this node
	974	# before.
	975	break
	976	self._leaf_value_cache[key] = value
3805.4.6 by John Arbash Meinel refactor for clarity.	977
	978	def _get_leaf_nodes(self, node_indexes):
	979	"""Get a bunch of nodes, from cache or disk."""
	980	found = self._get_nodes(self._leaf_node_cache, node_indexes)
	981	self._cache_leaf_values(found)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	982	return found
	983
	984	def iter_all_entries(self):
	985	"""Iterate over all keys within the index.
	986
	987	:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
	988	The former tuple is used when there are no reference lists in the
	989	index, making the API compatible with simple key:value index types.
	990	There is no defined order for the result iteration - it will be in
	991	the most efficient order for the index.
	992	"""
	993	if 'evil' in debug.debug_flags:
	994	trace.mutter_callsite(3,
	995	"iter_all_entries scales with size of history.")
	996	if not self.key_count():
	997	return
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	998	if self._row_offsets[-1] == 1:
	999	# There is only the root node, and we read that via key_count()
	1000	if self.node_ref_lists:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1001	for key, (value, refs) in self._root_node.all_items():
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1002	yield (self, key, value, refs)
	1003	else:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1004	for key, (value, refs) in self._root_node.all_items():
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1005	yield (self, key, value)
	1006	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1007	start_of_leaves = self._row_offsets[-2]
	1008	end_of_leaves = self._row_offsets[-1]
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	1009	needed_offsets = range(start_of_leaves, end_of_leaves)
	1010	if needed_offsets == [0]:
	1011	# Special case when we only have a root node, as we have already
	1012	# read everything
	1013	nodes = [(0, self._root_node)]
	1014	else:
	1015	nodes = self._read_nodes(needed_offsets)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1016	# We iterate strictly in-order so that we can use this function
	1017	# for spilling index builds to disk.
	1018	if self.node_ref_lists:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	1019	for _, node in nodes:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1020	for key, (value, refs) in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1021	yield (self, key, value, refs)
	1022	else:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	1023	for _, node in nodes:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1024	for key, (value, refs) in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1025	yield (self, key, value)
	1026
	1027	@staticmethod
	1028	def _multi_bisect_right(in_keys, fixed_keys):
	1029	"""Find the positions where each 'in_key' would fit in fixed_keys.
	1030
	1031	This is equivalent to doing "bisect_right" on each in_key into
	1032	fixed_keys
	1033
	1034	:param in_keys: A sorted list of keys to match with fixed_keys
	1035	:param fixed_keys: A sorted list of keys to match against
	1036	:return: A list of (integer position, [key list]) tuples.
	1037	"""
	1038	if not in_keys:
	1039	return []
	1040	if not fixed_keys:
	1041	# no pointers in the fixed_keys list, which means everything must
	1042	# fall to the left.
	1043	return [(0, in_keys)]
	1044
	1045	# TODO: Iterating both lists will generally take M + N steps
	1046	# Bisecting each key will generally take M * log2 N steps.
	1047	# If we had an efficient way to compare, we could pick the method
	1048	# based on which has the fewer number of steps.
	1049	# There is also the argument that bisect_right is a compiled
	1050	# function, so there is even more to be gained.
	1051	# iter_steps = len(in_keys) + len(fixed_keys)
	1052	# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
	1053	if len(in_keys) == 1: # Bisect will always be faster for M = 1
5753.2.4 by Jelmer Vernooij Review feedback from John.	1054	return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1055	# elif bisect_steps < iter_steps:
	1056	# offsets = {}
	1057	# for key in in_keys:
	1058	# offsets.setdefault(bisect_right(fixed_keys, key),
	1059	# []).append(key)
	1060	# return [(o, offsets[o]) for o in sorted(offsets)]
	1061	in_keys_iter = iter(in_keys)
	1062	fixed_keys_iter = enumerate(fixed_keys)
	1063	cur_in_key = in_keys_iter.next()
	1064	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
	1065
	1066	class InputDone(Exception): pass
	1067	class FixedDone(Exception): pass
	1068
	1069	output = []
	1070	cur_out = []
	1071
	1072	# TODO: Another possibility is that rather than iterating on each side,
	1073	# we could use a combination of bisecting and iterating. For
	1074	# example, while cur_in_key < fixed_key, bisect to find its
	1075	# point, then iterate all matching keys, then bisect (restricted
	1076	# to only the remainder) for the next one, etc.
	1077	try:
	1078	while True:
	1079	if cur_in_key < cur_fixed_key:
	1080	cur_keys = []
	1081	cur_out = (cur_fixed_offset, cur_keys)
	1082	output.append(cur_out)
	1083	while cur_in_key < cur_fixed_key:
	1084	cur_keys.append(cur_in_key)
	1085	try:
	1086	cur_in_key = in_keys_iter.next()
	1087	except StopIteration:
	1088	raise InputDone
	1089	# At this point cur_in_key must be >= cur_fixed_key
	1090	# step the cur_fixed_key until we pass the cur key, or walk off
	1091	# the end
	1092	while cur_in_key >= cur_fixed_key:
	1093	try:
	1094	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
	1095	except StopIteration:
	1096	raise FixedDone
	1097	except InputDone:
	1098	# We consumed all of the input, nothing more to do
	1099	pass
	1100	except FixedDone:
	1101	# There was some input left, but we consumed all of fixed, so we
	1102	# have to add one more for the tail
	1103	cur_keys = [cur_in_key]
	1104	cur_keys.extend(in_keys_iter)
	1105	cur_out = (len(fixed_keys), cur_keys)
	1106	output.append(cur_out)
	1107	return output
	1108
4593.4.5 by John Arbash Meinel Start adding some tests.	1109	def _walk_through_internal_nodes(self, keys):
	1110	"""Take the given set of keys, and find the corresponding LeafNodes.
	1111
	1112	:param keys: An unsorted iterable of keys to search for
	1113	:return: (nodes, index_and_keys)
	1114	nodes is a dict mapping {index: LeafNode}
	1115	keys_at_index is a list of tuples of [(index, [keys for Leaf])]
	1116	"""
	1117	# 6 seconds spent in miss_torture using the sorted() line.
	1118	# Even with out of order disk IO it seems faster not to sort it when
	1119	# large queries are being made.
	1120	keys_at_index = [(0, sorted(keys))]
	1121
	1122	for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
	1123	node_indexes = [idx for idx, s_keys in keys_at_index]
	1124	nodes = self._get_internal_nodes(node_indexes)
	1125
	1126	next_nodes_and_keys = []
	1127	for node_index, sub_keys in keys_at_index:
	1128	node = nodes[node_index]
	1129	positions = self._multi_bisect_right(sub_keys, node.keys)
	1130	node_offset = next_row_start + node.offset
	1131	next_nodes_and_keys.extend([(node_offset + pos, s_keys)
	1132	for pos, s_keys in positions])
	1133	keys_at_index = next_nodes_and_keys
	1134	# We should now be at the _LeafNodes
	1135	node_indexes = [idx for idx, s_keys in keys_at_index]
	1136
	1137	# TODO: We may not want to always read all the nodes in one
	1138	# big go. Consider setting a max size on this.
	1139	nodes = self._get_leaf_nodes(node_indexes)
	1140	return nodes, keys_at_index
	1141
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1142	def iter_entries(self, keys):
	1143	"""Iterate over keys within the index.
	1144
	1145	:param keys: An iterable providing the keys to be retrieved.
	1146	:return: An iterable as per iter_all_entries, but restricted to the
	1147	keys supplied. No additional keys will be returned, and every
	1148	key supplied that is in the index will be returned.
	1149	"""
	1150	# 6 seconds spent in miss_torture using the sorted() line.
	1151	# Even with out of order disk IO it seems faster not to sort it when
	1152	# large queries are being made.
	1153	# However, now that we are doing multi-way bisecting, we need the keys
	1154	# in sorted order anyway. We could change the multi-way code to not
	1155	# require sorted order. (For example, it bisects for the first node,
	1156	# does an in-order search until a key comes before the current point,
	1157	# which it then bisects for, etc.)
	1158	keys = frozenset(keys)
	1159	if not keys:
	1160	return
	1161
	1162	if not self.key_count():
	1163	return
	1164
	1165	needed_keys = []
	1166	if self._leaf_value_cache is None:
	1167	needed_keys = keys
	1168	else:
	1169	for key in keys:
	1170	value = self._leaf_value_cache.get(key, None)
	1171	if value is not None:
	1172	# This key is known not to be here, skip it
	1173	value, refs = value
	1174	if self.node_ref_lists:
	1175	yield (self, key, value, refs)
	1176	else:
	1177	yield (self, key, value)
	1178	else:
	1179	needed_keys.append(key)
	1180
	1181	last_key = None
	1182	needed_keys = keys
	1183	if not needed_keys:
	1184	return
4593.4.5 by John Arbash Meinel Start adding some tests.	1185	nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1186	for node_index, sub_keys in nodes_and_keys:
	1187	if not sub_keys:
	1188	continue
	1189	node = nodes[node_index]
	1190	for next_sub_key in sub_keys:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1191	if next_sub_key in node:
	1192	value, refs = node[next_sub_key]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1193	if self.node_ref_lists:
	1194	yield (self, next_sub_key, value, refs)
	1195	else:
	1196	yield (self, next_sub_key, value)
	1197
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1198	def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1199	"""Find the parent_map information for the set of keys.
	1200
	1201	This populates the parent_map dict and missing_keys set based on the
	1202	queried keys. It also can fill out an arbitrary number of parents that
	1203	it finds while searching for the supplied keys.
	1204
	1205	It is unlikely that you want to call this directly. See
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1206	"CombinedGraphIndex.find_ancestry()" for a more appropriate API.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1207
	1208	:param keys: A keys whose ancestry we want to return
	1209	Every key will either end up in 'parent_map' or 'missing_keys'.
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1210	:param ref_list_num: This index in the ref_lists is the parents we
	1211	care about.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1212	:param parent_map: {key: parent_keys} for keys that are present in this
	1213	index. This may contain more entries than were in 'keys', that are
	1214	reachable ancestors of the keys requested.
4593.4.5 by John Arbash Meinel Start adding some tests.	1215	:param missing_keys: keys which are known to be missing in this index.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1216	This may include parents that were not directly requested, but we
	1217	were able to determine that they are not present in this index.
	1218	:return: search_keys parents that were found but not queried to know
	1219	if they are missing or present. Callers can re-query this index for
	1220	those keys, and they will be placed into parent_map or missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1221	"""
	1222	if not self.key_count():
	1223	# We use key_count() to trigger reading the root node and
	1224	# determining info about this BTreeGraphIndex
	1225	# If we don't have any keys, then everything is missing
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1226	missing_keys.update(keys)
	1227	return set()
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1228	if ref_list_num >= self.node_ref_lists:
	1229	raise ValueError('No ref list %d, index has %d ref lists'
	1230	% (ref_list_num, self.node_ref_lists))
	1231
	1232	# The main trick we are trying to accomplish is that when we find a
	1233	# key listing its parents, we expect that the parent key is also likely
	1234	# to sit on the same page. Allowing us to expand parents quickly
	1235	# without suffering the full stack of bisecting, etc.
4593.4.5 by John Arbash Meinel Start adding some tests.	1236	nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1237
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1238	# These are parent keys which could not be immediately resolved on the
	1239	# page where the child was present. Note that we may already be
	1240	# searching for that key, and it may actually be present [or known
	1241	# missing] on one of the other pages we are reading.
	1242	# TODO:
	1243	# We could try searching for them in the immediate previous or next
	1244	# page. If they occur "later" we could put them in a pending lookup
	1245	# set, and then for each node we read thereafter we could check to
	1246	# see if they are present.
	1247	# However, we don't know the impact of keeping this list of things
	1248	# that I'm going to search for every node I come across from here on
	1249	# out.
	1250	# It doesn't handle the case when the parent key is missing on a
	1251	# page that we don't read. So we already have to handle being
	1252	# re-entrant for that.
	1253	# Since most keys contain a date string, they are more likely to be
	1254	# found earlier in the file than later, but we would know that right
	1255	# away (key < min_key), and wouldn't keep searching it on every other
	1256	# page that we read.
	1257	# Mostly, it is an idea, one which should be benchmarked.
	1258	parents_not_on_page = set()
	1259
	1260	for node_index, sub_keys in nodes_and_keys:
	1261	if not sub_keys:
	1262	continue
	1263	# sub_keys is all of the keys we are looking for that should exist
	1264	# on this page, if they aren't here, then they won't be found
	1265	node = nodes[node_index]
	1266	parents_to_check = set()
	1267	for next_sub_key in sub_keys:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1268	if next_sub_key not in node:
4593.4.5 by John Arbash Meinel Start adding some tests.	1269	# This one is just not present in the index at all
	1270	missing_keys.add(next_sub_key)
	1271	else:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1272	value, refs = node[next_sub_key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1273	parent_keys = refs[ref_list_num]
	1274	parent_map[next_sub_key] = parent_keys
	1275	parents_to_check.update(parent_keys)
	1276	# Don't look for things we've already found
	1277	parents_to_check = parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1278	# this can be used to test the benefit of having the check loop
	1279	# inlined.
	1280	# parents_not_on_page.update(parents_to_check)
	1281	# continue
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1282	while parents_to_check:
	1283	next_parents_to_check = set()
	1284	for key in parents_to_check:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1285	if key in node:
	1286	value, refs = node[key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1287	parent_keys = refs[ref_list_num]
	1288	parent_map[key] = parent_keys
	1289	next_parents_to_check.update(parent_keys)
	1290	else:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1291	# This parent either is genuinely missing, or should be
	1292	# found on another page. Perf test whether it is better
	1293	# to check if this node should fit on this page or not.
	1294	# in the 'everything-in-one-pack' scenario, this not
	1295	# doing the check is 237ms vs 243ms.
	1296	# So slightly better, but I assume the standard 'lots
	1297	# of packs' is going to show a reasonable improvement
	1298	# from the check, because it avoids 'going around
	1299	# again' for everything that is in another index
4593.4.5 by John Arbash Meinel Start adding some tests.	1300	# parents_not_on_page.add(key)
	1301	# Missing for some reason
	1302	if key < node.min_key:
	1303	# in the case of bzr.dev, 3.4k/5.3k misses are
	1304	# 'earlier' misses (65%)
	1305	parents_not_on_page.add(key)
	1306	elif key > node.max_key:
	1307	# This parent key would be present on a different
	1308	# LeafNode
	1309	parents_not_on_page.add(key)
	1310	else:
	1311	# assert key != node.min_key and key != node.max_key
	1312	# If it was going to be present, it would be on
	1313	# this page, so mark it missing.
	1314	missing_keys.add(key)
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1315	parents_to_check = next_parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1316	# Might want to do another .difference() from missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1317	# parents_not_on_page could have been found on a different page, or be
	1318	# known to be missing. So cull out everything that has already been
	1319	# found.
4593.4.5 by John Arbash Meinel Start adding some tests.	1320	search_keys = parents_not_on_page.difference(
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1321	parent_map).difference(missing_keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1322	return search_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1323
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1324	def iter_entries_prefix(self, keys):
	1325	"""Iterate over keys within the index using prefix matching.
	1326
	1327	Prefix matching is applied within the tuple of a key, not to within
	1328	the bytestring of each key element. e.g. if you have the keys ('foo',
	1329	'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
	1330	only the former key is returned.
	1331
	1332	WARNING: Note that this method currently causes a full index parse
	1333	unconditionally (which is reasonably appropriate as it is a means for
	1334	thunking many small indices into one larger one and still supplies
	1335	iter_all_entries at the thunk layer).
	1336
	1337	:param keys: An iterable providing the key prefixes to be retrieved.
	1338	Each key prefix takes the form of a tuple the length of a key, but
	1339	with the last N elements 'None' rather than a regular bytestring.
	1340	The first element cannot be 'None'.
	1341	:return: An iterable as per iter_all_entries, but restricted to the
	1342	keys with a matching prefix to those supplied. No additional keys
	1343	will be returned, and every match that is in the index will be
	1344	returned.
	1345	"""
	1346	keys = sorted(set(keys))
	1347	if not keys:
	1348	return
	1349	# Load if needed to check key lengths
	1350	if self._key_count is None:
	1351	self._get_root_node()
	1352	# TODO: only access nodes that can satisfy the prefixes we are looking
	1353	# for. For now, to meet API usage (as this function is not used by
	1354	# current bzrlib) just suck the entire index and iterate in memory.
	1355	nodes = {}
	1356	if self.node_ref_lists:
	1357	if self._key_length == 1:
	1358	for _1, key, value, refs in self.iter_all_entries():
	1359	nodes[key] = value, refs
	1360	else:
	1361	nodes_by_key = {}
	1362	for _1, key, value, refs in self.iter_all_entries():
	1363	key_value = key, value, refs
	1364	# For a key of (foo, bar, baz) create
	1365	# _nodes_by_key[foo][bar][baz] = key_value
	1366	key_dict = nodes_by_key
	1367	for subkey in key[:-1]:
	1368	key_dict = key_dict.setdefault(subkey, {})
	1369	key_dict[key[-1]] = key_value
	1370	else:
	1371	if self._key_length == 1:
	1372	for _1, key, value in self.iter_all_entries():
	1373	nodes[key] = value
	1374	else:
	1375	nodes_by_key = {}
	1376	for _1, key, value in self.iter_all_entries():
	1377	key_value = key, value
	1378	# For a key of (foo, bar, baz) create
	1379	# _nodes_by_key[foo][bar][baz] = key_value
	1380	key_dict = nodes_by_key
	1381	for subkey in key[:-1]:
	1382	key_dict = key_dict.setdefault(subkey, {})
	1383	key_dict[key[-1]] = key_value
	1384	if self._key_length == 1:
	1385	for key in keys:
	1386	# sanity check
	1387	if key[0] is None:
1388	raise errors.BadIndexKey(key)
1389	if len(key) != self._key_length:
1390	raise errors.BadIndexKey(key)
1391	try:
1392	if self.node_ref_lists:
1393	value, node_refs = nodes[key]
1394	yield self, key, value, node_refs
1395	else:
1396	yield self, key, nodes[key]
1397	except KeyError:
1398	pass
1399	return
1400	for key in keys:
1401	# sanity check
1402	if key[0] is None:
1403	raise errors.BadIndexKey(key)
1404	if len(key) != self._key_length:
1405	raise errors.BadIndexKey(key)
1406	# find what it refers to:
1407	key_dict = nodes_by_key
1408	elements = list(key)
1409	# find the subdict whose contents should be returned.
1410	try:
1411	while len(elements) and elements[0] is not None:
1412	key_dict = key_dict[elements[0]]
1413	elements.pop(0)
1414	except KeyError:
1415	# a non-existant lookup.
1416	continue
1417	if len(elements):
1418	dicts = [key_dict]
1419	while dicts:
1420	key_dict = dicts.pop(-1)
1421	# can't be empty or would not exist
1422	item, value = key_dict.iteritems().next()
1423	if type(value) == dict:
1424	# push keys
1425	dicts.extend(key_dict.itervalues())
1426	else:
1427	# yield keys
1428	for value in key_dict.itervalues():
1429	# each value is the key:value:node refs tuple
1430	# ready to yield.
1431	yield (self, ) + value
1432	else:
1433	# the last thing looked up was a terminal element
1434	yield (self, ) + key_dict
1435
1436	def key_count(self):
1437	"""Return an estimate of the number of keys in this index.
1438
1439	For BTreeGraphIndex the estimate is exact as it is contained in the
1440	header.
1441	"""
1442	if self._key_count is None:
1443	self._get_root_node()
1444	return self._key_count
1445
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1446	def _compute_row_offsets(self):
	1447	"""Fill out the _row_offsets attribute based on _row_lengths."""
	1448	offsets = []
	1449	row_offset = 0
	1450	for row in self._row_lengths:
	1451	offsets.append(row_offset)
	1452	row_offset += row
	1453	offsets.append(row_offset)
	1454	self._row_offsets = offsets
	1455
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1456	def _parse_header_from_bytes(self, bytes):
	1457	"""Parse the header from a region of bytes.
	1458
	1459	:param bytes: The data to parse.
	1460	:return: An offset, data tuple such as readv yields, for the unparsed
	1461	data. (which may be of length 0).
	1462	"""
	1463	signature = bytes[0:len(self._signature())]
	1464	if not signature == self._signature():
	1465	raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
	1466	lines = bytes[len(self._signature()):].splitlines()
	1467	options_line = lines[0]
	1468	if not options_line.startswith(_OPTION_NODE_REFS):
	1469	raise errors.BadIndexOptions(self)
	1470	try:
	1471	self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
	1472	except ValueError:
	1473	raise errors.BadIndexOptions(self)
	1474	options_line = lines[1]
	1475	if not options_line.startswith(_OPTION_KEY_ELEMENTS):
	1476	raise errors.BadIndexOptions(self)
	1477	try:
	1478	self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
	1479	except ValueError:
	1480	raise errors.BadIndexOptions(self)
	1481	options_line = lines[2]
	1482	if not options_line.startswith(_OPTION_LEN):
	1483	raise errors.BadIndexOptions(self)
	1484	try:
	1485	self._key_count = int(options_line[len(_OPTION_LEN):])
	1486	except ValueError:
	1487	raise errors.BadIndexOptions(self)
	1488	options_line = lines[3]
	1489	if not options_line.startswith(_OPTION_ROW_LENGTHS):
	1490	raise errors.BadIndexOptions(self)
	1491	try:
	1492	self._row_lengths = map(int, [length for length in
	1493	options_line[len(_OPTION_ROW_LENGTHS):].split(',')
	1494	if len(length)])
	1495	except ValueError:
	1496	raise errors.BadIndexOptions(self)
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1497	self._compute_row_offsets()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1498
	1499	# calculate the bytes we have processed
	1500	header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
	1501	return header_end, bytes[header_end:]
	1502
	1503	def _read_nodes(self, nodes):
	1504	"""Read some nodes from disk into the LRU cache.
	1505
	1506	This performs a readv to get the node data into memory, and parses each
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1507	node, then yields it to the caller. The nodes are requested in the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1508	supplied order. If possible doing sort() on the list before requesting
	1509	a read may improve performance.
	1510
	1511	:param nodes: The nodes to read. 0 - first node, 1 - second node etc.
	1512	:return: None
	1513	"""
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1514	# may be the byte string of the whole file
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1515	bytes = None
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1516	# list of (offset, length) regions of the file that should, evenually
	1517	# be read in to data_ranges, either from 'bytes' or from the transport
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1518	ranges = []
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1519	base_offset = self._base_offset
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1520	for index in nodes:
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1521	offset = (index * _PAGE_SIZE)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1522	size = _PAGE_SIZE
	1523	if index == 0:
	1524	# Root node - special case
	1525	if self._size:
	1526	size = min(_PAGE_SIZE, self._size)
	1527	else:
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1528	# The only case where we don't know the size, is for very
	1529	# small indexes. So we read the whole thing
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1530	bytes = self._transport.get_bytes(self._name)
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1531	num_bytes = len(bytes)
	1532	self._size = num_bytes - base_offset
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1533	# the whole thing should be parsed out of 'bytes'
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1534	ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
	1535	for start in xrange(base_offset, num_bytes, _PAGE_SIZE)]
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1536	break
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1537	else:
3763.8.6 by John Arbash Meinel Fix the logic a bit, and add a bit more tweaking opportunities	1538	if offset > self._size:
	1539	raise AssertionError('tried to read past the end'
	1540	' of the file %s > %s'
	1541	% (offset, self._size))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1542	size = min(size, self._size - offset)
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1543	ranges.append((base_offset + offset, size))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1544	if not ranges:
	1545	return
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1546	elif bytes is not None:
	1547	# already have the whole file
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1548	data_ranges = [(start, bytes[start:start+size])
	1549	for start, size in ranges]
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1550	elif self._file is None:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1551	data_ranges = self._transport.readv(self._name, ranges)
	1552	else:
	1553	data_ranges = []
	1554	for offset, size in ranges:
	1555	self._file.seek(offset)
	1556	data_ranges.append((offset, self._file.read(size)))
	1557	for offset, data in data_ranges:
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1558	offset -= base_offset
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1559	if offset == 0:
	1560	# extract the header
	1561	offset, data = self._parse_header_from_bytes(data)
	1562	if len(data) == 0:
	1563	continue
	1564	bytes = zlib.decompress(data)
	1565	if bytes.startswith(_LEAF_FLAG):
5365.5.18 by John Arbash Meinel Expose the new leaf node factory across the stack.	1566	node = self._leaf_factory(bytes, self._key_length,
	1567	self.node_ref_lists)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1568	elif bytes.startswith(_INTERNAL_FLAG):
	1569	node = _InternalNode(bytes)
	1570	else:
	1571	raise AssertionError("Unknown node type for %r" % bytes)
	1572	yield offset / _PAGE_SIZE, node
	1573
	1574	def _signature(self):
	1575	"""The file signature for this index type."""
	1576	return _BTSIGNATURE
	1577
	1578	def validate(self):
	1579	"""Validate that everything in the index can be accessed."""
	1580	# just read and parse every node.
	1581	self._get_root_node()
	1582	if len(self._row_lengths) > 1:
	1583	start_node = self._row_offsets[1]
	1584	else:
	1585	# We shouldn't be reading anything anyway
	1586	start_node = 1
	1587	node_end = self._row_offsets[-1]
	1588	for node in self._read_nodes(range(start_node, node_end)):
	1589	pass
	1590
	1591
5365.5.18 by John Arbash Meinel Expose the new leaf node factory across the stack.	1592	_gcchk_factory = _LeafNode
	1593
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1594	try:
4459.2.1 by Vincent Ladeuil Use a consistent scheme for naming pyrex source files.	1595	from bzrlib import _btree_serializer_pyx as _btree_serializer
5365.5.18 by John Arbash Meinel Expose the new leaf node factory across the stack.	1596	_gcchk_factory = _btree_serializer._parse_into_chk
4574.3.6 by Martin Pool More warnings when failing to load extensions	1597	except ImportError, e:
4574.3.8 by Martin Pool Only mutter extension load errors when they occur, and record for later	1598	osutils.failed_to_load_extension(e)
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	1599	from bzrlib import _btree_serializer_py as _btree_serializer