~bzr-pqm/bzr/bzr.dev : contents of bzrlib/btree

~bzr-pqm/bzr/bzr.dev : (revision 4747.2.1)

3641.3.29 by John Arbash Meinel Cleanup the copyright headers	1	# Copyright (C) 2008 Canonical Ltd
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	2	#
	3	# This program is free software; you can redistribute it and/or modify
3641.3.29 by John Arbash Meinel Cleanup the copyright headers	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob update FSF mailing address	15	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	16	#
	17
	18	"""B+Tree indices"""
	19
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	20	import cStringIO
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	21	from bisect import bisect_right
	22	import math
	23	import tempfile
	24	import zlib
	25
	26	from bzrlib import (
	27	chunk_writer,
	28	debug,
	29	errors,
4208.1.2 by John Arbash Meinel Switch to using a FIFOCache.	30	fifo_cache,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	31	index,
	32	lru_cache,
	33	osutils,
	34	trace,
	35	)
	36	from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
	37	from bzrlib.transport import get_transport
	38
	39
# First line of a serialised index.
_BTSIGNATURE = "B+Tree Graph Index 2\n"
# Header option listing the number of nodes in each row, root row first.
_OPTION_ROW_LENGTHS = "row_lengths="
# First line written into a leaf node / an internal node respectively.
_LEAF_FLAG = "type=leaf\n"
_INTERNAL_FLAG = "type=internal\n"
# Second line of an internal node; the builder follows it with a node count
# taken from the row below.
_INTERNAL_OFFSET = "offset="

# Bytes kept free at the start of the first node of a row; the index header
# is written into this space for the root row.
_RESERVED_HEADER_BYTES = 120
# Size in bytes of one serialised node.
_PAGE_SIZE = 4096

# 4K per page: 4MB - 1000 entries
_NODE_CACHE_SIZE = 1000
	51
	52
	53	class _BuilderRow(object):
	54	"""The stored state accumulated while writing out a row in the index.
	55
	56	:ivar spool: A temporary file used to accumulate nodes for this row
	57	in the tree.
	58	:ivar nodes: The count of nodes emitted so far.
	59	"""
	60
	61	def __init__(self):
	62	"""Create a _BuilderRow."""
	63	self.nodes = 0
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	64	self.spool = None# tempfile.TemporaryFile(prefix='bzr-index-row-')
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	65	self.writer = None
	66
	67	def finish_node(self, pad=True):
	68	byte_lines, _, padding = self.writer.finish()
	69	if self.nodes == 0:
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	70	self.spool = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	71	# padded note:
	72	self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	73	elif self.nodes == 1:
	74	# We got bigger than 1 node, switch to a temp file
	75	spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
	76	spool.write(self.spool.getvalue())
	77	self.spool = spool
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	78	skipped_bytes = 0
	79	if not pad and padding:
	80	del byte_lines[-1]
	81	skipped_bytes = padding
	82	self.spool.writelines(byte_lines)
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	83	remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
	84	if remainder != 0:
	85	raise AssertionError("incorrect node length: %d, %d"
	86	% (self.spool.tell(), remainder))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	87	self.nodes += 1
	88	self.writer = None
	89
	90
	91	class _InternalBuilderRow(_BuilderRow):
	92	"""The stored state accumulated while writing out internal rows."""
	93
	94	def finish_node(self, pad=True):
	95	if not pad:
	96	raise AssertionError("Must pad internal nodes only.")
	97	_BuilderRow.finish_node(self)
	98
	99
class _LeafBuilderRow(_BuilderRow):
    """The stored state accumulated while writing out the leaf row."""
	102
	103
	104	class BTreeBuilder(index.GraphIndexBuilder):
	105	"""A Builder for B+Tree based Graph indices.
	106
	107	The resulting graph has the structure:
	108
	109	_SIGNATURE OPTIONS NODES
	110	_SIGNATURE := 'B+Tree Graph Index 2' NEWLINE
	111	OPTIONS := REF_LISTS KEY_ELEMENTS LENGTH ROW_LENGTHS
	112	REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
	113	KEY_ELEMENTS := 'key_elements=' DIGITS NEWLINE
	114	LENGTH := 'len=' DIGITS NEWLINE
	115	ROW_LENGTHS := 'row_lengths=' DIGITS (COMMA DIGITS)* NEWLINE
	116	NODES := NODE_COMPRESSED*
	117	NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
	118	NODE_RAW := INTERNAL | LEAF
	119	INTERNAL := INTERNAL_FLAG POINTERS
	120	LEAF := LEAF_FLAG ROWS
	121	KEY_ELEMENT := Not-whitespace-utf8
	122	KEY := KEY_ELEMENT (NULL KEY_ELEMENT)*
	123	ROWS := ROW*
	124	ROW := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
	125	ABSENT := 'a'
	126	REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
	127	REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
	128	REFERENCE := KEY
	129	VALUE := no-newline-no-null-bytes
	130	"""
	131
	132	def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
	133	"""See GraphIndexBuilder.__init__.
	134
	135	:param spill_at: Optional parameter controlling the maximum number
	136	of nodes that BTreeBuilder will hold in memory.
	137	"""
	138	index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
	139	key_elements=key_elements)
	140	self._spill_at = spill_at
	141	self._backing_indices = []
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	142	# A map of {key: (node_refs, value)}
	143	self._nodes = {}
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	144	# Indicate it hasn't been built yet
	145	self._nodes_by_key = None
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	146	self._optimize_for_size = False
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	147
	148	def add_node(self, key, value, references=()):
	149	"""Add a node to the index.
	150
	151	If adding the node causes the builder to reach its spill_at threshold,
	152	disk spilling will be triggered.
	153
	154	:param key: The key. keys are non-empty tuples containing
	155	as many whitespace-free utf8 bytestrings as the key length
	156	defined for this index.
	157	:param references: An iterable of iterables of keys. Each is a
	158	reference to another key.
	159	:param value: The value to associate with the key. It may be any
	160	bytes as long as it does not contain \0 or \n.
	161	"""
3644.2.9 by John Arbash Meinel Refactor some code.	162	# we don't care about absent_references
3644.2.9 by John Arbash Meinel Refactor some code.	163	node_refs, _ = self._check_key_ref_value(key, references, value)
3644.2.2 by John Arbash Meinel the new btree index doesn't have 'absent' keys in its _nodes	164	if key in self._nodes:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	165	raise errors.BadIndexDuplicateKey(key, self)
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	166	self._nodes[key] = (node_refs, value)
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	167	self._keys.add(key)
3644.2.9 by John Arbash Meinel Refactor some code.	168	if self._nodes_by_key is not None and self._key_length > 1:
3644.2.9 by John Arbash Meinel Refactor some code.	169	self._update_nodes_by_key(key, value, node_refs)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	170	if len(self._keys) < self._spill_at:
	171	return
3644.2.9 by John Arbash Meinel Refactor some code.	172	self._spill_mem_keys_to_disk()
	173
	174	def _spill_mem_keys_to_disk(self):
	175	"""Write the in memory keys down to disk to cap memory consumption.
	176
	177	If we already have some keys written to disk, we will combine them so
	178	as to preserve the sorted order. The algorithm for combining uses
	179	powers of two. So on the first spill, write all mem nodes into a
	180	single index. On the second spill, combine the mem nodes with the nodes
	181	on disk to create a 2x sized disk index and get rid of the first index.
	182	On the third spill, create a single new disk index, which will contain
	183	the mem nodes, and preserve the existing 2x sized index. On the fourth,
	184	combine mem with the first and second indexes, creating a new one of
	185	size 4x. On the fifth create a single new one, etc.
	186	"""
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	187	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	188	(new_backing_file, size,
	189	backing_pos) = self._spill_mem_keys_and_combine()
	190	else:
	191	new_backing_file, size = self._spill_mem_keys_without_combining()
	192	# Note: The transport here isn't strictly needed, because we will use
	193	# direct access to the new_backing._file object
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	194	new_backing = BTreeGraphIndex(get_transport('.'), '<temp>', size)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	195	# GC will clean up the file
	196	new_backing._file = new_backing_file
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	197	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	198	if len(self._backing_indices) == backing_pos:
	199	self._backing_indices.append(None)
	200	self._backing_indices[backing_pos] = new_backing
	201	for backing_pos in range(backing_pos):
	202	self._backing_indices[backing_pos] = None
	203	else:
	204	self._backing_indices.append(new_backing)
	205	self._keys = set()
	206	self._nodes = {}
	207	self._nodes_by_key = None
	208
	209	def _spill_mem_keys_without_combining(self):
	210	return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
	211
	212	def _spill_mem_keys_and_combine(self):
4168.3.4 by John Arbash Meinel Restore the ability to spill, but prepare a flag to disable it.	213	iterators_to_combine = [self._iter_mem_nodes()]
	214	pos = -1
	215	for pos, backing in enumerate(self._backing_indices):
	216	if backing is None:
	217	pos -= 1
	218	break
	219	iterators_to_combine.append(backing.iter_all_entries())
	220	backing_pos = pos + 1
	221	new_backing_file, size = \
	222	self._write_nodes(self._iter_smallest(iterators_to_combine),
	223	allow_optimize=False)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	224	return new_backing_file, size, backing_pos
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	225
	226	def add_nodes(self, nodes):
	227	"""Add nodes to the index.
	228
	229	:param nodes: An iterable of (key, node_refs, value) entries to add.
	230	"""
	231	if self.reference_lists:
	232	for (key, value, node_refs) in nodes:
	233	self.add_node(key, value, node_refs)
	234	else:
	235	for (key, value) in nodes:
	236	self.add_node(key, value)
	237
	238	def _iter_mem_nodes(self):
	239	"""Iterate over the nodes held in memory."""
3644.2.8 by John Arbash Meinel Two quick tweaks.	240	nodes = self._nodes
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	241	if self.reference_lists:
3644.2.8 by John Arbash Meinel Two quick tweaks.	242	for key in sorted(nodes):
	243	references, value = nodes[key]
	244	yield self, key, value, references
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	245	else:
3644.2.8 by John Arbash Meinel Two quick tweaks.	246	for key in sorted(nodes):
	247	references, value = nodes[key]
	248	yield self, key, value
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	249
	250	def _iter_smallest(self, iterators_to_combine):
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	251	if len(iterators_to_combine) == 1:
	252	for value in iterators_to_combine[0]:
	253	yield value
	254	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	255	current_values = []
	256	for iterator in iterators_to_combine:
	257	try:
	258	current_values.append(iterator.next())
	259	except StopIteration:
	260	current_values.append(None)
	261	last = None
	262	while True:
	263	# Decorate candidates with the value to allow 2.4's min to be used.
	264	candidates = [(item[1][1], item) for item
	265	in enumerate(current_values) if item[1] is not None]
	266	if not len(candidates):
	267	return
	268	selected = min(candidates)
	269	# undecorate back to (pos, node)
	270	selected = selected[1]
	271	if last == selected[1][1]:
	272	raise errors.BadIndexDuplicateKey(last, self)
	273	last = selected[1][1]
	274	# Yield, with self as the index
	275	yield (self,) + selected[1][1:]
	276	pos = selected[0]
	277	try:
	278	current_values[pos] = iterators_to_combine[pos].next()
	279	except StopIteration:
	280	current_values[pos] = None
	281
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	282	def _add_key(self, string_key, line, rows, allow_optimize=True):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	283	"""Add a key to the current chunk.
	284
	285	:param string_key: The key to add.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	286	:param line: The fully serialised key and value.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	287	:param allow_optimize: If set to False, prevent setting the optimize
	288	flag when writing out. This is used by the _spill_mem_keys_to_disk
	289	functionality.
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	290	"""
	291	if rows[-1].writer is None:
	292	# opening a new leaf chunk;
	293	for pos, internal_row in enumerate(rows[:-1]):
	294	# flesh out any internal nodes that are needed to
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	295	# preserve the height of the tree
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	296	if internal_row.writer is None:
	297	length = _PAGE_SIZE
	298	if internal_row.nodes == 0:
	299	length -= _RESERVED_HEADER_BYTES # padded
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	300	if allow_optimize:
	301	optimize_for_size = self._optimize_for_size
	302	else:
	303	optimize_for_size = False
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	304	internal_row.writer = chunk_writer.ChunkWriter(length, 0,
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	305	optimize_for_size=optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	306	internal_row.writer.write(_INTERNAL_FLAG)
	307	internal_row.writer.write(_INTERNAL_OFFSET +
	308	str(rows[pos + 1].nodes) + "\n")
	309	# add a new leaf
	310	length = _PAGE_SIZE
	311	if rows[-1].nodes == 0:
	312	length -= _RESERVED_HEADER_BYTES # padded
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	313	rows[-1].writer = chunk_writer.ChunkWriter(length,
	314	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	315	rows[-1].writer.write(_LEAF_FLAG)
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	316	if rows[-1].writer.write(line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	317	# this key did not fit in the node:
	318	rows[-1].finish_node()
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	319	key_line = string_key + "\n"
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	320	new_row = True
	321	for row in reversed(rows[:-1]):
	322	# Mark the start of the next node in the node above. If it
4031.3.1 by Frank Aspell Fixing various typos	323	# doesn't fit then propagate upwards until we find one that
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	324	# it does fit into.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	325	if row.writer.write(key_line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	326	row.finish_node()
	327	else:
	328	# We've found a node that can handle the pointer.
	329	new_row = False
	330	break
	331	# If we reached the current root without being able to mark the
	332	# division point, then we need a new root:
	333	if new_row:
	334	# We need a new row
	335	if 'index' in debug.debug_flags:
	336	trace.mutter('Inserting new global row.')
	337	new_row = _InternalBuilderRow()
	338	reserved_bytes = 0
	339	rows.insert(0, new_row)
	340	# This will be padded, hence the -100
	341	new_row.writer = chunk_writer.ChunkWriter(
	342	_PAGE_SIZE - _RESERVED_HEADER_BYTES,
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	343	reserved_bytes,
	344	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	345	new_row.writer.write(_INTERNAL_FLAG)
	346	new_row.writer.write(_INTERNAL_OFFSET +
	347	str(rows[1].nodes - 1) + "\n")
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	348	new_row.writer.write(key_line)
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	349	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	350
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	351	def _write_nodes(self, node_iterator, allow_optimize=True):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	352	"""Write node_iterator out as a B+Tree.
	353
	354	:param node_iterator: An iterator of sorted nodes. Each node should
	355	match the output given by iter_all_entries.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	356	:param allow_optimize: If set to False, prevent setting the optimize
	357	flag when writing out. This is used by the _spill_mem_keys_to_disk
	358	functionality.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	359	:return: A file handle for a temporary file containing a B+Tree for
	360	the nodes.
	361	"""
	362	# The index rows - rows[0] is the root, rows[1] is the layer under it
	363	# etc.
	364	rows = []
	365	# forward sorted by key. In future we may consider topological sorting,
	366	# at the cost of table scans for direct lookup, or a second index for
	367	# direct lookup
	368	key_count = 0
	369	# A stack with the number of nodes of each size. 0 is the root node
	370	# and must always be 1 (if there are any nodes in the tree).
	371	self.row_lengths = []
	372	# Loop over all nodes adding them to the bottom row
	373	# (rows[-1]). When we finish a chunk in a row,
4031.3.1 by Frank Aspell Fixing various typos	374	# propagate the key that didn't fit (comes after the chunk) to the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	375	# row above, transitively.
	376	for node in node_iterator:
	377	if key_count == 0:
	378	# First key triggers the first row
	379	rows.append(_LeafBuilderRow())
	380	key_count += 1
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	381	string_key, line = _btree_serializer._flatten_node(node,
	382	self.reference_lists)
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	383	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	384	for row in reversed(rows):
	385	pad = (type(row) != _LeafBuilderRow)
	386	row.finish_node(pad=pad)
	387	lines = [_BTSIGNATURE]
	388	lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
	389	lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
	390	lines.append(_OPTION_LEN + str(key_count) + '\n')
	391	row_lengths = [row.nodes for row in rows]
	392	lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	393	if row_lengths and row_lengths[-1] > 1:
	394	result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
	395	else:
	396	result = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	397	result.writelines(lines)
	398	position = sum(map(len, lines))
	399	root_row = True
	400	if position > _RESERVED_HEADER_BYTES:
	401	raise AssertionError("Could not fit the header in the"
	402	" reserved space: %d > %d"
	403	% (position, _RESERVED_HEADER_BYTES))
	404	# write the rows out:
	405	for row in rows:
	406	reserved = _RESERVED_HEADER_BYTES # reserved space for first node
	407	row.spool.flush()
	408	row.spool.seek(0)
	409	# copy nodes to the finalised file.
	410	# Special case the first node as it may be prefixed
	411	node = row.spool.read(_PAGE_SIZE)
	412	result.write(node[reserved:])
	413	result.write("\x00" * (reserved - position))
	414	position = 0 # Only the root row actually has an offset
	415	copied_len = osutils.pumpfile(row.spool, result)
	416	if copied_len != (row.nodes - 1) * _PAGE_SIZE:
	417	if type(row) != _LeafBuilderRow:
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	418	raise AssertionError("Incorrect amount of data copied"
	419	" expected: %d, got: %d"
	420	% ((row.nodes - 1) * _PAGE_SIZE,
	421	copied_len))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	422	result.flush()
	423	size = result.tell()
	424	result.seek(0)
	425	return result, size
	426
	427	def finish(self):
	428	"""Finalise the index.
	429
	430	:return: A file handle for a temporary file containing the nodes added
	431	to the index.
	432	"""
	433	return self._write_nodes(self.iter_all_entries())[0]
	434
	435	def iter_all_entries(self):
	436	"""Iterate over all keys within the index
	437
4343.2.2 by John Arbash Meinel Fix an important doc bug about the api of iter_all_entries()	438	:return: An iterable of (index, key, value, reference_lists). There is
	439	no defined order for the result iteration - it will be in the most
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	440	efficient order for the index (in this case dictionary hash order).
	441	"""
	442	if 'evil' in debug.debug_flags:
	443	trace.mutter_callsite(3,
	444	"iter_all_entries scales with size of history.")
	445	# Doing serial rather than ordered would be faster; but this shouldn't
	446	# be getting called routinely anyway.
3644.2.8 by John Arbash Meinel Two quick tweaks.	447	iterators = [self._iter_mem_nodes()]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	448	for backing in self._backing_indices:
	449	if backing is not None:
	450	iterators.append(backing.iter_all_entries())
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	451	if len(iterators) == 1:
	452	return iterators[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	453	return self._iter_smallest(iterators)
	454
	455	def iter_entries(self, keys):
	456	"""Iterate over keys within the index.
	457
	458	:param keys: An iterable providing the keys to be retrieved.
	459	:return: An iterable of (index, key, value, reference_lists). There is no
	460	defined order for the result iteration - it will be in the most
	461	efficient order for the index (keys iteration order in this case).
	462	"""
	463	keys = set(keys)
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	464	local_keys = keys.intersection(self._keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	465	if self.reference_lists:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	466	for key in local_keys:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	467	node = self._nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	468	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	469	else:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	470	for key in local_keys:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	471	node = self._nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	472	yield self, key, node[1]
3847.2.1 by John Arbash Meinel Shortcut BTreeBuilder.iter_entries when there are no backing indices.	473	# Find things that are in backing indices that have not been handled
	474	# yet.
3847.2.3 by John Arbash Meinel Bring back the shortcut	475	if not self._backing_indices:
3847.2.3 by John Arbash Meinel Bring back the shortcut	476	return # We won't find anything there either
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	477	# Remove all of the keys that we found locally
	478	keys.difference_update(local_keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	479	for backing in self._backing_indices:
	480	if backing is None:
	481	continue
	482	if not keys:
	483	return
	484	for node in backing.iter_entries(keys):
	485	keys.remove(node[1])
	486	yield (self,) + node[1:]
	487
	488	def iter_entries_prefix(self, keys):
	489	"""Iterate over keys within the index using prefix matching.
	490
	491	Prefix matching is applied within the tuple of a key, not to within
	492	the bytestring of each key element. e.g. if you have the keys ('foo',
	493	'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
	494	only the former key is returned.
	495
	496	:param keys: An iterable providing the key prefixes to be retrieved.
	497	Each key prefix takes the form of a tuple the length of a key, but
	498	with the last N elements 'None' rather than a regular bytestring.
	499	The first element cannot be 'None'.
	500	:return: An iterable as per iter_all_entries, but restricted to the
	501	keys with a matching prefix to those supplied. No additional keys
	502	will be returned, and every match that is in the index will be
	503	returned.
	504	"""
	505	# XXX: To much duplication with the GraphIndex class; consider finding
	506	# a good place to pull out the actual common logic.
	507	keys = set(keys)
	508	if not keys:
	509	return
	510	for backing in self._backing_indices:
	511	if backing is None:
	512	continue
	513	for node in backing.iter_entries_prefix(keys):
	514	yield (self,) + node[1:]
	515	if self._key_length == 1:
	516	for key in keys:
	517	# sanity check
	518	if key[0] is None:
	519	raise errors.BadIndexKey(key)
	520	if len(key) != self._key_length:
	521	raise errors.BadIndexKey(key)
	522	try:
	523	node = self._nodes[key]
	524	except KeyError:
	525	continue
	526	if self.reference_lists:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	527	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	528	else:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	529	yield self, key, node[1]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	530	return
	531	for key in keys:
	532	# sanity check
	533	if key[0] is None:
	534	raise errors.BadIndexKey(key)
	535	if len(key) != self._key_length:
	536	raise errors.BadIndexKey(key)
	537	# find what it refers to:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	538	key_dict = self._get_nodes_by_key()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	539	elements = list(key)
	540	# find the subdict to return
	541	try:
	542	while len(elements) and elements[0] is not None:
	543	key_dict = key_dict[elements[0]]
	544	elements.pop(0)
	545	except KeyError:
	546	# a non-existant lookup.
	547	continue
	548	if len(elements):
	549	dicts = [key_dict]
	550	while dicts:
	551	key_dict = dicts.pop(-1)
	552	# can't be empty or would not exist
	553	item, value = key_dict.iteritems().next()
	554	if type(value) == dict:
	555	# push keys
	556	dicts.extend(key_dict.itervalues())
	557	else:
	558	# yield keys
	559	for value in key_dict.itervalues():
	560	yield (self, ) + value
	561	else:
	562	yield (self, ) + key_dict
	563
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	564	def _get_nodes_by_key(self):
	565	if self._nodes_by_key is None:
	566	nodes_by_key = {}
	567	if self.reference_lists:
	568	for key, (references, value) in self._nodes.iteritems():
	569	key_dict = nodes_by_key
	570	for subkey in key[:-1]:
	571	key_dict = key_dict.setdefault(subkey, {})
	572	key_dict[key[-1]] = key, value, references
	573	else:
	574	for key, (references, value) in self._nodes.iteritems():
	575	key_dict = nodes_by_key
	576	for subkey in key[:-1]:
	577	key_dict = key_dict.setdefault(subkey, {})
	578	key_dict[key[-1]] = key, value
	579	self._nodes_by_key = nodes_by_key
	580	return self._nodes_by_key
	581
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	582	def key_count(self):
	583	"""Return an estimate of the number of keys in this index.
	584
	585	For InMemoryGraphIndex the estimate is exact.
	586	"""
	587	return len(self._keys) + sum(backing.key_count() for backing in
	588	self._backing_indices if backing is not None)
	589
    def validate(self):
        """Do nothing: in-memory indices have no known corruption at the moment."""
	592
	593
	594	class _LeafNode(object):
	595	"""A leaf node for a serialised B+Tree index."""
	596
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	597	__slots__ = ('keys', 'min_key', 'max_key')
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	598
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	599	def __init__(self, bytes, key_length, ref_list_length):
	600	"""Parse bytes to create a leaf node object."""
	601	# splitlines mangles the \r delimiters.. don't use it.
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	602	key_list = _btree_serializer._parse_leaf_lines(bytes,
	603	key_length, ref_list_length)
	604	if key_list:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	605	self.min_key = key_list[0][0]
	606	self.max_key = key_list[-1][0]
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	607	else:
	608	self.min_key = self.max_key = None
	609	self.keys = dict(key_list)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	610
	611
	612	class _InternalNode(object):
	613	"""An internal node for a serialised B+Tree index."""
	614
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	615	__slots__ = ('keys', 'offset')
	616
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	617	def __init__(self, bytes):
	618	"""Parse bytes to create an internal node object."""
	619	# splitlines mangles the \r delimiters.. don't use it.
	620	self.keys = self._parse_lines(bytes.split('\n'))
	621
	622	def _parse_lines(self, lines):
	623	nodes = []
	624	self.offset = int(lines[1][7:])
	625	for line in lines[2:]:
	626	if line == '':
	627	break
4075.3.1 by John Arbash Meinel Use PyString_InternInPlace to intern() the various parts of keys that are processed.	628	nodes.append(tuple(map(intern, line.split('\0'))))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	629	return nodes
	630
	631
	632	class BTreeGraphIndex(object):
	633	"""Access to nodes via the standard GraphIndex interface for B+Tree's.
	634
	635	Individual nodes are held in a LRU cache. This holds the root node in
	636	memory except when very large walks are done.
	637	"""
	638
4634.71.1 by John Arbash Meinel Work around bug #402623 by allowing BTreeGraphIndex(...,unlimited_cache=True).	639	def __init__(self, transport, name, size, unlimited_cache=False):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	640	"""Create a B+Tree index object on the index name.
	641
	642	:param transport: The transport to read data for the index from.
	643	:param name: The file name of the index on transport.
	644	:param size: Optional size of the index in bytes. This allows
	645	compatibility with the GraphIndex API, as well as ensuring that
	646	the initial read (to read the root node header) can be done
	647	without over-reading even on empty indices, and on small indices
	648	allows single-IO to read the entire index.
4634.71.1 by John Arbash Meinel Work around bug #402623 by allowing BTreeGraphIndex(...,unlimited_cache=True).	649	:param unlimited_cache: If set to True, then instead of using an
	650	LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
	651	cache all leaf nodes.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	652	"""
	653	self._transport = transport
	654	self._name = name
	655	self._size = size
	656	self._file = None
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	657	self._recommended_pages = self._compute_recommended_pages()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	658	self._root_node = None
	659	# Default max size is 100,000 leave values
	660	self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
4634.71.1 by John Arbash Meinel Work around bug #402623 by allowing BTreeGraphIndex(...,unlimited_cache=True).	661	if unlimited_cache:
	662	self._leaf_node_cache = {}
	663	self._internal_node_cache = {}
	664	else:
	665	self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
	666	# We use a FIFO here just to prevent possible blowout. However, a
	667	# 300k record btree has only 3k leaf nodes, and only 20 internal
	668	# nodes. A value of 100 scales to ~100100100 = 1M records.
	669	self._internal_node_cache = fifo_cache.FIFOCache(100)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	670	self._key_count = None
	671	self._row_lengths = None
	672	self._row_offsets = None # Start of each row, [-1] is the end
	673
	674	def __eq__(self, other):
	675	"""Equal when self and other were created with the same parameters."""
	676	return (
	677	type(self) == type(other) and
	678	self._transport == other._transport and
	679	self._name == other._name and
	680	self._size == other._size)
	681
	682	def __ne__(self, other):
	683	return not self.__eq__(other)
	684
3763.8.12 by John Arbash Meinel Code cleanup.	685	def _get_and_cache_nodes(self, nodes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	686	"""Read nodes and cache them in the lru.
	687
	688	The nodes list supplied is sorted and then read from disk, each node
	689	being inserted it into the _node_cache.
	690
	691	Note: Asking for more nodes than the _node_cache can contain will
	692	result in some of the results being immediately discarded, to prevent
	693	this an assertion is raised if more nodes are asked for than are
	694	cachable.
	695
	696	:return: A dict of {node_pos: node}
	697	"""
	698	found = {}
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	699	start_of_leaves = None
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	700	for node_pos, node in self._read_nodes(sorted(nodes)):
	701	if node_pos == 0: # Special case
	702	self._root_node = node
	703	else:
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	704	if start_of_leaves is None:
	705	start_of_leaves = self._row_offsets[-2]
	706	if node_pos < start_of_leaves:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	707	self._internal_node_cache[node_pos] = node
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	708	else:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	709	self._leaf_node_cache[node_pos] = node
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	710	found[node_pos] = node
	711	return found
	712
def _compute_recommended_pages(self):
    """Convert transport's recommended_page_size into btree pages.

    recommended_page_size is in bytes, we want to know how many _PAGE_SIZE
    pages fit in that length, rounding any partial page up.

    :return: The number of pages, as an int.
    """
    recommended_bytes = self._transport.recommended_page_size()
    # Divide by a float so the division is never truncated before ceil().
    pages = math.ceil(recommended_bytes / float(_PAGE_SIZE))
    return int(pages)
	723
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	724	def _compute_total_pages_in_index(self):
	725	"""How many pages are in the index.
	726
	727	If we have read the header we will use the value stored there.
	728	Otherwise it will be computed based on the length of the index.
	729	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	730	if self._size is None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	731	raise AssertionError('_compute_total_pages_in_index should not be'
	732	' called when self._size is None')
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	733	if self._root_node is not None:
	734	# This is the number of pages as defined by the header
	735	return self._row_offsets[-1]
	736	# This is the number of pages as defined by the size of the index. They
	737	# should be indentical.
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	738	total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
	739	return total_pages
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	740
def _expand_offsets(self, offsets):
    """Find extra pages to download.

    The idea is that we always want to make big-enough requests (like 64kB
    for http), so that we don't waste round trips. So given the entries
    that we already have cached and the new pages being downloaded figure
    out what other pages we might want to read.

    See also doc/developers/btree_index_prefetch.txt for more details.

    :param offsets: The offsets to be read
    :return: A list of offsets to download
    """
    verbose = 'index' in debug.debug_flags
    if verbose:
        trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)

    request_size = len(offsets)
    page_budget = self._recommended_pages
    if request_size >= page_budget:
        # Don't add more, we are already requesting more than enough
        if verbose:
            trace.mutter(' not expanding large request (%s >= %s)',
                         request_size, page_budget)
        return offsets
    if self._size is None:
        # Don't try anything, because we don't know where the file ends
        if verbose:
            trace.mutter(' not expanding without knowing index size')
        return offsets

    total_pages = self._compute_total_pages_in_index()
    cached_offsets = self._get_offsets_to_cached_pages()
    if total_pages - len(cached_offsets) <= page_budget:
        # Reading recommended_pages would read the rest of the index, so
        # just read whatever is left.
        if not cached_offsets:
            expanded = range(total_pages)
        else:
            expanded = [page for page in xrange(total_pages)
                        if page not in cached_offsets]
        if verbose:
            trace.mutter(' reading all unread pages: %s', expanded)
        return expanded

    if self._root_node is not None:
        tree_depth = len(self._row_lengths)
        if len(cached_offsets) < tree_depth and request_size == 1:
            # We haven't read enough to justify expansion.
            # If we are only going to read the root node, and 1 leaf node,
            # then it isn't worth expanding our request. Once we've read at
            # least 2 nodes, then we are probably doing a search, and we
            # start expanding our requests.
            if verbose:
                trace.mutter(' not expanding on first reads')
            return offsets
        candidates = self._expand_to_neighbors(offsets, cached_offsets,
                                               total_pages)
    else:
        # ATM on the first read of the root node of a large index, we don't
        # bother pre-reading any other pages. This is because the
        # likelihood of actually reading interesting pages is very low.
        # See doc/developers/btree_index_prefetch.txt for a discussion, and
        # a possible implementation when we are guessing that the second
        # layer index is small
        candidates = offsets

    final_offsets = sorted(candidates)
    if verbose:
        trace.mutter('expanded: %s', final_offsets)
    return final_offsets
	809
	810	def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
	811	"""Expand requests to neighbors until we have enough pages.
	812
	813	This is called from _expand_offsets after policy has determined that we
	814	want to expand.
	815	We only want to expand requests within a given layer. We cheat a little
	816	bit and assume all requests will be in the same layer. This is true
	817	given the current design, but if it changes this algorithm may perform
	818	oddly.
	819
	820	:param offsets: requested offsets
	821	:param cached_offsets: offsets for pages we currently have cached
	822	:return: A set() of offsets after expansion
	823	"""
	824	final_offsets = set(offsets)
	825	first = end = None
	826	new_tips = set(final_offsets)
	827	while len(final_offsets) < self._recommended_pages and new_tips:
	828	next_tips = set()
	829	for pos in new_tips:
	830	if first is None:
	831	first, end = self._find_layer_first_and_end(pos)
	832	previous = pos - 1
	833	if (previous > 0
	834	and previous not in cached_offsets
	835	and previous not in final_offsets
	836	and previous >= first):
	837	next_tips.add(previous)
	838	after = pos + 1
	839	if (after < total_pages
	840	and after not in cached_offsets
	841	and after not in final_offsets
	842	and after < end):
	843	next_tips.add(after)
	844	# This would keep us from going bigger than
	845	# recommended_pages by only expanding the first offsets.
	846	# However, if we are making a 'wide' request, it is
	847	# reasonable to expand all points equally.
	848	# if len(final_offsets) > recommended_pages:
	849	# break
	850	final_offsets.update(next_tips)
	851	new_tips = next_tips
	852	return final_offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	853
def external_references(self, ref_list_num):
    """Return the keys referenced by this index but not present in it.

    The root node is read first if it has not been loaded yet.  Every
    entry in the index is then visited, so this scales with the size of
    the index.

    :param ref_list_num: Which of each entry's reference lists to inspect.
    :return: A set of the keys found in that reference list of some entry
        which are not themselves keys of any entry.
    :raises ValueError: if the index has no reference list with that
        number.
    """
    if self._root_node is None:
        self._get_root_node()
    if not (ref_list_num + 1 <= self.node_ref_lists):
        raise ValueError('No ref list %d, index has %d ref lists'
            % (ref_list_num, self.node_ref_lists))
    present = set()
    referenced = set()
    for entry in self.iter_all_entries():
        # entry is (index, key, value, reference_lists)
        present.add(entry[1])
        referenced.update(entry[3][ref_list_num])
    return referenced.difference(present)
	866
3763.8.12 by John Arbash Meinel Code cleanup.	867	def _find_layer_first_and_end(self, offset):
	868	"""Find the start/stop nodes for the layer corresponding to offset.
	869
	870	:return: (first, end)
	871	first is the first node in this layer
	872	end is the first node of the next layer
	873	"""
	874	first = end = 0
	875	for roffset in self._row_offsets:
	876	first = end
	877	end = roffset
	878	if offset < roffset:
	879	break
	880	return first, end
	881
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	882	def _get_offsets_to_cached_pages(self):
3763.8.12 by John Arbash Meinel Code cleanup.	883	"""Determine what nodes we already have cached."""
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	884	cached_offsets = set(self._internal_node_cache.keys())
	885	cached_offsets.update(self._leaf_node_cache.keys())
3763.8.12 by John Arbash Meinel Code cleanup.	886	if self._root_node is not None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	887	cached_offsets.add(0)
	888	return cached_offsets
3763.8.12 by John Arbash Meinel Code cleanup.	889
	890	def _get_root_node(self):
	891	if self._root_node is None:
	892	# We may not have a root node yet
	893	self._get_internal_nodes([0])
	894	return self._root_node
	895
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	896	def _get_nodes(self, cache, node_indexes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	897	found = {}
	898	needed = []
	899	for idx in node_indexes:
	900	if idx == 0 and self._root_node is not None:
	901	found[0] = self._root_node
	902	continue
	903	try:
	904	found[idx] = cache[idx]
	905	except KeyError:
	906	needed.append(idx)
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	907	if not needed:
	908	return found
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	909	needed = self._expand_offsets(needed)
3763.8.12 by John Arbash Meinel Code cleanup.	910	found.update(self._get_and_cache_nodes(needed))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	911	return found
	912
	913	def _get_internal_nodes(self, node_indexes):
	914	"""Get a node, from cache or disk.
	915
	916	After getting it, the node will be cached.
	917	"""
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	918	return self._get_nodes(self._internal_node_cache, node_indexes)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	919
3805.4.6 by John Arbash Meinel refactor for clarity.	920	def _cache_leaf_values(self, nodes):
3805.4.6 by John Arbash Meinel refactor for clarity.	921	"""Cache directly from key => value, skipping the btree."""
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	922	if self._leaf_value_cache is not None:
3805.4.6 by John Arbash Meinel refactor for clarity.	923	for node in nodes.itervalues():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	924	for key, value in node.keys.iteritems():
	925	if key in self._leaf_value_cache:
	926	# Don't add the rest of the keys, we've seen this node
	927	# before.
	928	break
	929	self._leaf_value_cache[key] = value
3805.4.6 by John Arbash Meinel refactor for clarity.	930
	931	def _get_leaf_nodes(self, node_indexes):
	932	"""Get a bunch of nodes, from cache or disk."""
	933	found = self._get_nodes(self._leaf_node_cache, node_indexes)
	934	self._cache_leaf_values(found)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	935	return found
	936
	937	def iter_all_entries(self):
	938	"""Iterate over all keys within the index.
	939
	940	:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
	941	The former tuple is used when there are no reference lists in the
	942	index, making the API compatible with simple key:value index types.
	943	There is no defined order for the result iteration - it will be in
	944	the most efficient order for the index.
	945	"""
	946	if 'evil' in debug.debug_flags:
	947	trace.mutter_callsite(3,
	948	"iter_all_entries scales with size of history.")
	949	if not self.key_count():
	950	return
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	951	if self._row_offsets[-1] == 1:
	952	# There is only the root node, and we read that via key_count()
	953	if self.node_ref_lists:
	954	for key, (value, refs) in sorted(self._root_node.keys.items()):
	955	yield (self, key, value, refs)
	956	else:
	957	for key, (value, refs) in sorted(self._root_node.keys.items()):
	958	yield (self, key, value)
	959	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	960	start_of_leaves = self._row_offsets[-2]
	961	end_of_leaves = self._row_offsets[-1]
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	962	needed_offsets = range(start_of_leaves, end_of_leaves)
	963	if needed_offsets == [0]:
	964	# Special case when we only have a root node, as we have already
	965	# read everything
	966	nodes = [(0, self._root_node)]
	967	else:
	968	nodes = self._read_nodes(needed_offsets)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	969	# We iterate strictly in-order so that we can use this function
	970	# for spilling index builds to disk.
	971	if self.node_ref_lists:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	972	for _, node in nodes:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	973	for key, (value, refs) in sorted(node.keys.items()):
	974	yield (self, key, value, refs)
	975	else:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	976	for _, node in nodes:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	977	for key, (value, refs) in sorted(node.keys.items()):
	978	yield (self, key, value)
	979
	980	@staticmethod
	981	def _multi_bisect_right(in_keys, fixed_keys):
	982	"""Find the positions where each 'in_key' would fit in fixed_keys.
	983
	984	This is equivalent to doing "bisect_right" on each in_key into
	985	fixed_keys
	986
	987	:param in_keys: A sorted list of keys to match with fixed_keys
	988	:param fixed_keys: A sorted list of keys to match against
	989	:return: A list of (integer position, [key list]) tuples.
	990	"""
	991	if not in_keys:
	992	return []
	993	if not fixed_keys:
	994	# no pointers in the fixed_keys list, which means everything must
	995	# fall to the left.
	996	return [(0, in_keys)]
	997
	998	# TODO: Iterating both lists will generally take M + N steps
	999	# Bisecting each key will generally take M * log2 N steps.
	1000	# If we had an efficient way to compare, we could pick the method
	1001	# based on which has the fewer number of steps.
	1002	# There is also the argument that bisect_right is a compiled
	1003	# function, so there is even more to be gained.
	1004	# iter_steps = len(in_keys) + len(fixed_keys)
	1005	# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
	1006	if len(in_keys) == 1: # Bisect will always be faster for M = 1
	1007	return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
	1008	# elif bisect_steps < iter_steps:
	1009	# offsets = {}
	1010	# for key in in_keys:
	1011	# offsets.setdefault(bisect_right(fixed_keys, key),
	1012	# []).append(key)
	1013	# return [(o, offsets[o]) for o in sorted(offsets)]
	1014	in_keys_iter = iter(in_keys)
	1015	fixed_keys_iter = enumerate(fixed_keys)
	1016	cur_in_key = in_keys_iter.next()
	1017	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
	1018
	1019	class InputDone(Exception): pass
	1020	class FixedDone(Exception): pass
	1021
	1022	output = []
	1023	cur_out = []
	1024
	1025	# TODO: Another possibility is that rather than iterating on each side,
	1026	# we could use a combination of bisecting and iterating. For
	1027	# example, while cur_in_key < fixed_key, bisect to find its
	1028	# point, then iterate all matching keys, then bisect (restricted
	1029	# to only the remainder) for the next one, etc.
	1030	try:
	1031	while True:
	1032	if cur_in_key < cur_fixed_key:
	1033	cur_keys = []
	1034	cur_out = (cur_fixed_offset, cur_keys)
	1035	output.append(cur_out)
	1036	while cur_in_key < cur_fixed_key:
	1037	cur_keys.append(cur_in_key)
	1038	try:
	1039	cur_in_key = in_keys_iter.next()
	1040	except StopIteration:
1041	raise InputDone
1042	# At this point cur_in_key must be >= cur_fixed_key
1043	# step the cur_fixed_key until we pass the cur key, or walk off
1044	# the end
1045	while cur_in_key >= cur_fixed_key:
1046	try:
1047	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
1048	except StopIteration:
1049	raise FixedDone
1050	except InputDone:
1051	# We consumed all of the input, nothing more to do
1052	pass
1053	except FixedDone:
1054	# There was some input left, but we consumed all of fixed, so we
1055	# have to add one more for the tail
1056	cur_keys = [cur_in_key]
1057	cur_keys.extend(in_keys_iter)
1058	cur_out = (len(fixed_keys), cur_keys)
1059	output.append(cur_out)
1060	return output
1061
4593.4.5 by John Arbash Meinel Start adding some tests.	1062	def _walk_through_internal_nodes(self, keys):
	1063	"""Take the given set of keys, and find the corresponding LeafNodes.
	1064
	1065	:param keys: An unsorted iterable of keys to search for
	1066	:return: (nodes, index_and_keys)
	1067	nodes is a dict mapping {index: LeafNode}
	1068	keys_at_index is a list of tuples of [(index, [keys for Leaf])]
	1069	"""
	1070	# 6 seconds spent in miss_torture using the sorted() line.
	1071	# Even with out of order disk IO it seems faster not to sort it when
	1072	# large queries are being made.
	1073	keys_at_index = [(0, sorted(keys))]
	1074
	1075	for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
	1076	node_indexes = [idx for idx, s_keys in keys_at_index]
	1077	nodes = self._get_internal_nodes(node_indexes)
	1078
	1079	next_nodes_and_keys = []
	1080	for node_index, sub_keys in keys_at_index:
	1081	node = nodes[node_index]
	1082	positions = self._multi_bisect_right(sub_keys, node.keys)
	1083	node_offset = next_row_start + node.offset
	1084	next_nodes_and_keys.extend([(node_offset + pos, s_keys)
	1085	for pos, s_keys in positions])
	1086	keys_at_index = next_nodes_and_keys
	1087	# We should now be at the _LeafNodes
	1088	node_indexes = [idx for idx, s_keys in keys_at_index]
	1089
	1090	# TODO: We may not want to always read all the nodes in one
	1091	# big go. Consider setting a max size on this.
	1092	nodes = self._get_leaf_nodes(node_indexes)
	1093	return nodes, keys_at_index
	1094
def iter_entries(self, keys):
    """Iterate over keys within the index.

    :param keys: An iterable providing the keys to be retrieved.
    :return: An iterable as per iter_all_entries, but restricted to the
        keys supplied. No additional keys will be returned, and every
        key supplied that is in the index will be returned exactly once.
    """
    # The multi-way bisecting done while walking the tree needs the keys
    # in sorted order; the sorting happens in
    # _walk_through_internal_nodes, so here we only remove duplicates.
    keys = frozenset(keys)
    if not keys:
        return

    if not self.key_count():
        return

    if self._leaf_value_cache is None:
        needed_keys = keys
    else:
        # Answer what we can straight from the key => value cache and only
        # walk the tree for the keys that were not found there.
        needed_keys = []
        for key in keys:
            cached = self._leaf_value_cache.get(key, None)
            if cached is None:
                needed_keys.append(key)
                continue
            value, refs = cached
            if self.node_ref_lists:
                yield (self, key, value, refs)
            else:
                yield (self, key, value)
        if not needed_keys:
            return

    nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
    for node_index, sub_keys in nodes_and_keys:
        if not sub_keys:
            continue
        node = nodes[node_index]
        for next_sub_key in sub_keys:
            # Keys that are absent from the leaf are simply not in the
            # index; they are skipped silently.
            if next_sub_key in node.keys:
                value, refs = node.keys[next_sub_key]
                if self.node_ref_lists:
                    yield (self, next_sub_key, value, refs)
                else:
                    yield (self, next_sub_key, value)
	1150
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1151	def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1152	"""Find the parent_map information for the set of keys.
	1153
	1154	This populates the parent_map dict and missing_keys set based on the
	1155	queried keys. It also can fill out an arbitrary number of parents that
	1156	it finds while searching for the supplied keys.
	1157
	1158	It is unlikely that you want to call this directly. See
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1159	"CombinedGraphIndex.find_ancestry()" for a more appropriate API.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1160
	1161	:param keys: A keys whose ancestry we want to return
	1162	Every key will either end up in 'parent_map' or 'missing_keys'.
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1163	:param ref_list_num: This index in the ref_lists is the parents we
	1164	care about.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1165	:param parent_map: {key: parent_keys} for keys that are present in this
	1166	index. This may contain more entries than were in 'keys', that are
	1167	reachable ancestors of the keys requested.
4593.4.5 by John Arbash Meinel Start adding some tests.	1168	:param missing_keys: keys which are known to be missing in this index.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1169	This may include parents that were not directly requested, but we
	1170	were able to determine that they are not present in this index.
	1171	:return: search_keys parents that were found but not queried to know
	1172	if they are missing or present. Callers can re-query this index for
	1173	those keys, and they will be placed into parent_map or missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1174	"""
	1175	if not self.key_count():
	1176	# We use key_count() to trigger reading the root node and
	1177	# determining info about this BTreeGraphIndex
	1178	# If we don't have any keys, then everything is missing
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1179	missing_keys.update(keys)
	1180	return set()
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1181	if ref_list_num >= self.node_ref_lists:
	1182	raise ValueError('No ref list %d, index has %d ref lists'
	1183	% (ref_list_num, self.node_ref_lists))
	1184
	1185	# The main trick we are trying to accomplish is that when we find a
	1186	# key listing its parents, we expect that the parent key is also likely
	1187	# to sit on the same page. Allowing us to expand parents quickly
	1188	# without suffering the full stack of bisecting, etc.
4593.4.5 by John Arbash Meinel Start adding some tests.	1189	nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1190
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1191	# These are parent keys which could not be immediately resolved on the
	1192	# page where the child was present. Note that we may already be
	1193	# searching for that key, and it may actually be present [or known
	1194	# missing] on one of the other pages we are reading.
	1195	# TODO:
	1196	# We could try searching for them in the immediate previous or next
	1197	# page. If they occur "later" we could put them in a pending lookup
	1198	# set, and then for each node we read thereafter we could check to
	1199	# see if they are present.
	1200	# However, we don't know the impact of keeping this list of things
	1201	# that I'm going to search for every node I come across from here on
	1202	# out.
	1203	# It doesn't handle the case when the parent key is missing on a
	1204	# page that we don't read. So we already have to handle being
	1205	# re-entrant for that.
	1206	# Since most keys contain a date string, they are more likely to be
	1207	# found earlier in the file than later, but we would know that right
	1208	# away (key < min_key), and wouldn't keep searching it on every other
	1209	# page that we read.
	1210	# Mostly, it is an idea, one which should be benchmarked.
	1211	parents_not_on_page = set()
	1212
	1213	for node_index, sub_keys in nodes_and_keys:
	1214	if not sub_keys:
	1215	continue
	1216	# sub_keys is all of the keys we are looking for that should exist
	1217	# on this page, if they aren't here, then they won't be found
	1218	node = nodes[node_index]
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1219	node_keys = node.keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1220	parents_to_check = set()
	1221	for next_sub_key in sub_keys:
4593.4.5 by John Arbash Meinel Start adding some tests.	1222	if next_sub_key not in node_keys:
	1223	# This one is just not present in the index at all
	1224	missing_keys.add(next_sub_key)
	1225	else:
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1226	value, refs = node_keys[next_sub_key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1227	parent_keys = refs[ref_list_num]
	1228	parent_map[next_sub_key] = parent_keys
	1229	parents_to_check.update(parent_keys)
	1230	# Don't look for things we've already found
	1231	parents_to_check = parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1232	# this can be used to test the benefit of having the check loop
	1233	# inlined.
	1234	# parents_not_on_page.update(parents_to_check)
	1235	# continue
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1236	while parents_to_check:
	1237	next_parents_to_check = set()
	1238	for key in parents_to_check:
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1239	if key in node_keys:
	1240	value, refs = node_keys[key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1241	parent_keys = refs[ref_list_num]
	1242	parent_map[key] = parent_keys
	1243	next_parents_to_check.update(parent_keys)
	1244	else:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1245	# This parent either is genuinely missing, or should be
	1246	# found on another page. Perf test whether it is better
	1247	# to check if this node should fit on this page or not.
	1248	# in the 'everything-in-one-pack' scenario, this not
	1249	# doing the check is 237ms vs 243ms.
	1250	# So slightly better, but I assume the standard 'lots
	1251	# of packs' is going to show a reasonable improvement
	1252	# from the check, because it avoids 'going around
	1253	# again' for everything that is in another index
4593.4.5 by John Arbash Meinel Start adding some tests.	1254	# parents_not_on_page.add(key)
	1255	# Missing for some reason
	1256	if key < node.min_key:
	1257	# in the case of bzr.dev, 3.4k/5.3k misses are
	1258	# 'earlier' misses (65%)
	1259	parents_not_on_page.add(key)
	1260	elif key > node.max_key:
	1261	# This parent key would be present on a different
	1262	# LeafNode
	1263	parents_not_on_page.add(key)
	1264	else:
	1265	# assert key != node.min_key and key != node.max_key
	1266	# If it was going to be present, it would be on
	1267	# this page, so mark it missing.
	1268	missing_keys.add(key)
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1269	parents_to_check = next_parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1270	# Might want to do another .difference() from missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1271	# parents_not_on_page could have been found on a different page, or be
	1272	# known to be missing. So cull out everything that has already been
	1273	# found.
4593.4.5 by John Arbash Meinel Start adding some tests.	1274	search_keys = parents_not_on_page.difference(
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1275	parent_map).difference(missing_keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1276	return search_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1277
def iter_entries_prefix(self, keys):
    """Yield the entries whose keys match any of the given key prefixes.

    Matching works on whole key elements, never inside the bytestring of an
    element: with the keys ('foo', 'bar') and ('foobar', 'gam'), a search
    for ('foo', None) returns only the former.

    WARNING: Every call with at least one prefix walks the complete index
    through iter_all_entries() and builds an in-memory lookup table before
    anything is yielded. (This method is a means for thunking many small
    indices into one larger one.)

    :param keys: An iterable providing the key prefixes to be retrieved.
        Each prefix is a tuple the length of a key, with the last N
        elements 'None' rather than a regular bytestring. The first element
        cannot be 'None'.
    :return: An iterable as per iter_all_entries, restricted to the keys
        matching one of the prefixes. Prefixes are processed in sorted
        order with duplicates removed.
    :raises errors.BadIndexKey: when the iteration reaches a prefix whose
        first element is None or whose length differs from the key length
        of this index.
    """
    prefixes = sorted(set(keys))
    if not prefixes:
        return
    # Load if needed to check key lengths
    if self._key_count is None:
        self._get_root_node()
    has_refs = self.node_ref_lists
    key_length = self._key_length
    # TODO: only access nodes that can satisfy the prefixes we are looking
    # for. For now just suck the entire index and iterate in memory.
    if key_length == 1:
        # Single-element keys cannot carry wildcards, so a flat dict of
        # exact keys is all that is needed.
        flat = {}
        if has_refs:
            for _unused, key, value, refs in self.iter_all_entries():
                flat[key] = value, refs
        else:
            for _unused, key, value in self.iter_all_entries():
                flat[key] = value
        for prefix in prefixes:
            # sanity check
            if prefix[0] is None:
                raise errors.BadIndexKey(prefix)
            if len(prefix) != key_length:
                raise errors.BadIndexKey(prefix)
            try:
                found = flat[prefix]
            except KeyError:
                # Not in the index: nothing to report for this prefix.
                continue
            if has_refs:
                value, node_refs = found
                yield self, prefix, value, node_refs
            else:
                yield self, prefix, found
        return

    # Multi-element keys: build nested dicts, one level per key element,
    # whose innermost values are the tuples to yield (minus the index).
    by_element = {}

    def remember(key, record):
        # For a key of (foo, bar, baz) store by_element[foo][bar][baz]
        subtree = by_element
        for element in key[:-1]:
            subtree = subtree.setdefault(element, {})
        subtree[key[-1]] = record

    if has_refs:
        for _unused, key, value, refs in self.iter_all_entries():
            remember(key, (key, value, refs))
    else:
        for _unused, key, value in self.iter_all_entries():
            remember(key, (key, value))

    for prefix in prefixes:
        # sanity check
        if prefix[0] is None:
            raise errors.BadIndexKey(prefix)
        if len(prefix) != key_length:
            raise errors.BadIndexKey(prefix)
        # Descend through the leading elements that are not None.
        subtree = by_element
        depth = 0
        try:
            while depth < len(prefix) and prefix[depth] is not None:
                subtree = subtree[prefix[depth]]
                depth += 1
        except KeyError:
            # a non-existent lookup.
            continue
        if depth == len(prefix):
            # Every element was given, so the last thing looked up is the
            # stored record itself.
            yield (self, ) + subtree
            continue
        # A None was reached: report everything underneath this subtree.
        pending = [subtree]
        while pending:
            level = pending.pop()
            # A level can't be empty or it would not exist; peek at one
            # value to tell inner levels (dicts) from the record level.
            sample = level.itervalues().next()
            if type(sample) == dict:
                pending.extend(level.itervalues())
            else:
                for record in level.itervalues():
                    # each record is the key:value:node refs tuple, ready
                    # to yield.
                    yield (self, ) + record
1389
def key_count(self):
    """Return the number of keys in this index.

    The generic index API only promises an estimate; for BTreeGraphIndex
    the figure is exact because it is stored in the header. If the count is
    not known yet, the root node is read first to obtain it.
    """
    count = self._key_count
    if count is None:
        # Reading the root node is what fills in _key_count.
        self._get_root_node()
        count = self._key_count
    return count
1399
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1400	def _compute_row_offsets(self):
	1401	"""Fill out the _row_offsets attribute based on _row_lengths."""
	1402	offsets = []
	1403	row_offset = 0
	1404	for row in self._row_lengths:
	1405	offsets.append(row_offset)
	1406	row_offset += row
	1407	offsets.append(row_offset)
	1408	self._row_offsets = offsets
	1409
def _parse_header_from_bytes(self, bytes):
    """Parse the header from a region of bytes.

    The header is the file signature followed by four option lines, in
    this order: number of reference lists, number of key elements, number
    of keys, and the comma separated row lengths. Parsing stores them in
    node_ref_lists, _key_length, _key_count and _row_lengths respectively,
    and then recomputes _row_offsets.

    :param bytes: The data to parse.
    :return: An offset, data tuple such as readv yields, for the unparsed
        data. (which may be of length 0).
    :raises errors.BadIndexFormatSignature: if the data does not start
        with the signature of this index type.
    :raises errors.BadIndexOptions: if an option line is absent (truncated
        header), carries the wrong prefix, or has a value that is not an
        integer.
    """
    expected_signature = self._signature()
    signature = bytes[0:len(expected_signature)]
    if not signature == expected_signature:
        raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
    lines = bytes[len(expected_signature):].splitlines()
    if len(lines) < 4:
        # A truncated header must be reported as a bad index, not escape
        # as an IndexError from the lookups below.
        raise errors.BadIndexOptions(self)

    def int_option(line, prefix):
        # Return the integer that follows 'prefix' on an option line.
        if not line.startswith(prefix):
            raise errors.BadIndexOptions(self)
        try:
            return int(line[len(prefix):])
        except ValueError:
            raise errors.BadIndexOptions(self)

    self.node_ref_lists = int_option(lines[0], _OPTION_NODE_REFS)
    self._key_length = int_option(lines[1], _OPTION_KEY_ELEMENTS)
    self._key_count = int_option(lines[2], _OPTION_LEN)
    row_line = lines[3]
    if not row_line.startswith(_OPTION_ROW_LENGTHS):
        raise errors.BadIndexOptions(self)
    try:
        # Empty fields (e.g. from a trailing comma) are skipped.
        self._row_lengths = [
            int(length)
            for length in row_line[len(_OPTION_ROW_LENGTHS):].split(',')
            if len(length)]
    except ValueError:
        raise errors.BadIndexOptions(self)
    self._compute_row_offsets()

    # calculate the bytes we have processed: the signature, the four
    # option lines, and one line terminator byte for each of those lines.
    header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
    return header_end, bytes[header_end:]
	1456
	1457	def _read_nodes(self, nodes):
	1458	"""Read some nodes from disk into the LRU cache.
	1459
	1460	This performs a readv to get the node data into memory, and parses each
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1461	node, then yields it to the caller. The nodes are requested in the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1462	supplied order. If possible doing sort() on the list before requesting
	1463	a read may improve performance.
	1464
	1465	:param nodes: The nodes to read. 0 - first node, 1 - second node etc.
	1466	:return: None
	1467	"""
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1468	# may be the byte string of the whole file
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1469	bytes = None
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1470	# list of (offset, length) regions of the file that should, evenually
	1471	# be read in to data_ranges, either from 'bytes' or from the transport
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1472	ranges = []
	1473	for index in nodes:
	1474	offset = index * _PAGE_SIZE
	1475	size = _PAGE_SIZE
	1476	if index == 0:
	1477	# Root node - special case
	1478	if self._size:
	1479	size = min(_PAGE_SIZE, self._size)
	1480	else:
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1481	# The only case where we don't know the size, is for very
	1482	# small indexes. So we read the whole thing
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1483	bytes = self._transport.get_bytes(self._name)
	1484	self._size = len(bytes)
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1485	# the whole thing should be parsed out of 'bytes'
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1486	ranges.append((0, len(bytes)))
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1487	break
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1488	else:
3763.8.6 by John Arbash Meinel Fix the logic a bit, and add a bit more tweaking opportunities	1489	if offset > self._size:
	1490	raise AssertionError('tried to read past the end'
	1491	' of the file %s > %s'
	1492	% (offset, self._size))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1493	size = min(size, self._size - offset)
	1494	ranges.append((offset, size))
	1495	if not ranges:
	1496	return
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1497	elif bytes is not None:
	1498	# already have the whole file
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1499	data_ranges = [(start, bytes[start:start+_PAGE_SIZE])
	1500	for start in xrange(0, len(bytes), _PAGE_SIZE)]
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1501	elif self._file is None:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1502	data_ranges = self._transport.readv(self._name, ranges)
	1503	else:
	1504	data_ranges = []
	1505	for offset, size in ranges:
	1506	self._file.seek(offset)
	1507	data_ranges.append((offset, self._file.read(size)))
	1508	for offset, data in data_ranges:
	1509	if offset == 0:
	1510	# extract the header
	1511	offset, data = self._parse_header_from_bytes(data)
	1512	if len(data) == 0:
	1513	continue
	1514	bytes = zlib.decompress(data)
	1515	if bytes.startswith(_LEAF_FLAG):
	1516	node = _LeafNode(bytes, self._key_length, self.node_ref_lists)
	1517	elif bytes.startswith(_INTERNAL_FLAG):
	1518	node = _InternalNode(bytes)
	1519	else:
	1520	raise AssertionError("Unknown node type for %r" % bytes)
	1521	yield offset / _PAGE_SIZE, node
	1522
	1523	def _signature(self):
	1524	"""The file signature for this index type."""
	1525	return _BTSIGNATURE
	1526
	1527	def validate(self):
	1528	"""Validate that everything in the index can be accessed."""
	1529	# just read and parse every node.
	1530	self._get_root_node()
	1531	if len(self._row_lengths) > 1:
	1532	start_node = self._row_offsets[1]
	1533	else:
	1534	# We shouldn't be reading anything anyway
	1535	start_node = 1
	1536	node_end = self._row_offsets[-1]
	1537	for node in self._read_nodes(range(start_node, node_end)):
	1538	pass
	1539
	1540
# Bind _btree_serializer to the '_pyx' serializer module when it can be
# imported (presumably the compiled Pyrex extension -- confirm against the
# build). If that import fails, report the failure through
# osutils.failed_to_load_extension() and bind the same name to the '_py'
# module instead, so the rest of this file uses one name for either.
try:
    from bzrlib import _btree_serializer_pyx as _btree_serializer
except ImportError, e:
    osutils.failed_to_load_extension(e)
    from bzrlib import _btree_serializer_py as _btree_serializer