@@ -13,3 +13,3 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -18,1 +18,1 @@
 """B+Tree indices"""
@@ -22,2 +20,1 @@
 from bisect import bisect_right
-from copy import deepcopy
@@ -180,31 +177,46 @@
         combine mem with the first and second indexes, creating a new one of
         size 4x. On the fifth create a single new one, etc.
         """
-        iterators_to_combine = [self._iter_mem_nodes()]
-        pos = -1
-        for pos, backing in enumerate(self._backing_indices):
-            if backing is None:
-                pos -= 1
-                break
-            iterators_to_combine.append(backing.iter_all_entries())
-        backing_pos = pos + 1
-        new_backing_file, size = \
-            self._write_nodes(self._iter_smallest(iterators_to_combine))
+        if self._combine_backing_indices:
+            (new_backing_file, size,
+             backing_pos) = self._spill_mem_keys_and_combine()
+        else:
+            new_backing_file, size = self._spill_mem_keys_without_combining()
         dir_path, base_name = osutils.split(new_backing_file.name)
         # Note: The transport here isn't strictly needed, because we will use
         #       direct access to the new_backing._file object
         new_backing = BTreeGraphIndex(get_transport(dir_path),
                                       base_name, size)
         # GC will clean up the file
         new_backing._file = new_backing_file
-        if len(self._backing_indices) == backing_pos:
-            self._backing_indices.append(None)
-        self._backing_indices[backing_pos] = new_backing
-        for pos in range(backing_pos):
-            self._backing_indices[pos] = None
+        if self._combine_backing_indices:
+            if len(self._backing_indices) == backing_pos:
+                self._backing_indices.append(None)
+            self._backing_indices[backing_pos] = new_backing
+            for backing_pos in range(backing_pos):
+                self._backing_indices[backing_pos] = None
+        else:
+            self._backing_indices.append(new_backing)
         self._keys = set()
         self._nodes = {}
         self._nodes_by_key = None
 
+    def _spill_mem_keys_without_combining(self):
+        return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
+
+    def _spill_mem_keys_and_combine(self):
+        iterators_to_combine = [self._iter_mem_nodes()]
+        pos = -1
+        for pos, backing in enumerate(self._backing_indices):
+            if backing is None:
+                pos -= 1
+                break
+            iterators_to_combine.append(backing.iter_all_entries())
+        backing_pos = pos + 1
+        new_backing_file, size = \
+            self._write_nodes(self._iter_smallest(iterators_to_combine),
+                              allow_optimize=False)
+        return new_backing_file, size, backing_pos
+
     def add_nodes(self, nodes):
         """Add nodes to the index.
@@ -262,23 +274,30 @@
             except StopIteration:
                 current_values[pos] = None
 
-    def _add_key(self, string_key, line, rows):
+    def _add_key(self, string_key, line, rows, allow_optimize=True):
         """Add a key to the current chunk.
 
         :param string_key: The key to add.
         :param line: The fully serialised key and value.
+        :param allow_optimize: If set to False, prevent setting the optimize
+            flag when writing out. This is used by the _spill_mem_keys_to_disk
+            functionality.
         """
         if rows[-1].writer is None:
             # opening a new leaf chunk;
             for pos, internal_row in enumerate(rows[:-1]):
                 # flesh out any internal nodes that are needed to
                 # preserve the height of the tree
                 if internal_row.writer is None:
                     length = _PAGE_SIZE
                     if internal_row.nodes == 0:
                         length -= _RESERVED_HEADER_BYTES # padded
+                    if allow_optimize:
+                        optimize_for_size = self._optimize_for_size
+                    else:
+                        optimize_for_size = False
                     internal_row.writer = chunk_writer.ChunkWriter(length, 0,
-                        optimize_for_size=self._optimize_for_size)
+                        optimize_for_size=optimize_for_size)
                     internal_row.writer.write(_INTERNAL_FLAG)
                     internal_row.writer.write(_INTERNAL_OFFSET +
                         str(rows[pos + 1].nodes) + "\n")
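
The page-budget logic above is worth calling out: the first node written into a
row is "padded", giving up reserved header space, while every later node gets a
full page. A toy version of that computation, using stand-in values for
_PAGE_SIZE and _RESERVED_HEADER_BYTES (the real constants are defined elsewhere
in this file and are not shown in this hunk):

    PAGE_SIZE = 4096             # stand-in for _PAGE_SIZE
    RESERVED_HEADER_BYTES = 100  # stand-in for _RESERVED_HEADER_BYTES

    def writer_budget(nodes_in_row):
        # Mirrors the length computation in _add_key above.
        length = PAGE_SIZE
        if nodes_in_row == 0:
            length -= RESERVED_HEADER_BYTES  # padded
        return length

    assert writer_budget(0) == 3996  # first node of a row: smaller budget
    assert writer_budget(1) == 4096  # subsequent nodes: a full page
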
@@ -297,6 +316,6 @@
             for row in reversed(rows[:-1]):
                 # Mark the start of the next node in the node above. If it
-                # doesn't fit then propogate upwards until we find one that
+                # doesn't fit then propagate upwards until we find one that
                 # it does fit into.
                 if row.writer.write(key_line):
                     row.finish_node()
@@ -322,4 +341,4 @@
             new_row.writer.write(_INTERNAL_OFFSET +
                 str(rows[1].nodes - 1) + "\n")
             new_row.writer.write(key_line)
-            self._add_key(string_key, line, rows)
+            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
@@ -327,6 +346,9 @@
-    def _write_nodes(self, node_iterator):
+    def _write_nodes(self, node_iterator, allow_optimize=True):
         """Write node_iterator out as a B+Tree.
 
         :param node_iterator: An iterator of sorted nodes. Each node should
             match the output given by iter_all_entries.
+        :param allow_optimize: If set to False, prevent setting the optimize
+            flag when writing out. This is used by the _spill_mem_keys_to_disk
+            functionality.
         :return: A file handle for a temporary file containing a B+Tree for
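
Taken together, the allow_optimize changes thread a single decision through the
whole write path: spill files are throwaway intermediates, so ChunkWriter's
extra "optimize for size" repacking is wasted on them, while finish() keeps
whatever behaviour the builder was configured with. A runnable toy model of the
plumbing (toy class and method names; only the flag logic mirrors the patch):

    class ToyWriter(object):
        def __init__(self, optimize_for_size):
            self.optimize_for_size = optimize_for_size

    class ToyBuilder(object):
        def __init__(self, optimize_for_size):
            self._optimize_for_size = optimize_for_size
            self.writers = []

        def _add_key(self, key, allow_optimize=True):
            if allow_optimize:
                optimize_for_size = self._optimize_for_size
            else:
                optimize_for_size = False
            self.writers.append(ToyWriter(optimize_for_size))

        def _write_nodes(self, keys, allow_optimize=True):
            for key in keys:
                self._add_key(key, allow_optimize=allow_optimize)

        def spill(self, keys):
            # mirrors _spill_mem_keys_without_combining()
            self._write_nodes(keys, allow_optimize=False)

        def finish(self, keys):
            self._write_nodes(keys)  # default allow_optimize=True

    b = ToyBuilder(optimize_for_size=True)
    b.spill(['a', 'b'])
    b.finish(['a', 'b'])
    print([w.optimize_for_size for w in b.writers])
    # -> [False, False, True, True]: spills skip the costly repacking,
    #    the final index still gets it.
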
@@ -344,20 +366,20 @@
         self.row_lengths = []
         # Loop over all nodes adding them to the bottom row
         # (rows[-1]). When we finish a chunk in a row,
-        # propogate the key that didn't fit (comes after the chunk) to the
+        # propagate the key that didn't fit (comes after the chunk) to the
         # row above, transitively.
         for node in node_iterator:
             if key_count == 0:
                 # First key triggers the first row
                 rows.append(_LeafBuilderRow())
             key_count += 1
             string_key, line = _btree_serializer._flatten_node(node,
                 self.reference_lists)
-            self._add_key(string_key, line, rows)
+            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
         for row in reversed(rows):
             pad = (type(row) != _LeafBuilderRow)
             row.finish_node(pad=pad)
-        result = tempfile.NamedTemporaryFile()
+        result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
         lines = [_BTSIGNATURE]
         lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
         lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
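
One incidental improvement in this hunk: spill files now carry a recognisable
name, so a stray temporary file can be attributed to bzr at a glance. This is
plain stdlib behaviour:

    import tempfile

    f = tempfile.NamedTemporaryFile(prefix='bzr-index-')
    print(f.name)  # e.g. /tmp/bzr-index-a1b2c3 rather than an anonymous tmpXXXXXX
    f.close()      # as before, the file is deleted on close
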
@@ -405,8 +427,8 @@
     def iter_all_entries(self):
         """Iterate over all keys within the index
 
-        :return: An iterable of (index, key, reference_lists, value). There is no
-            defined order for the result iteration - it will be in the most
+        :return: An iterable of (index, key, value, reference_lists). There is
+            no defined order for the result iteration - it will be in the most
             efficient order for the index (in this case dictionary hash order).
         """
         if 'evil' in debug.debug_flags:
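
The corrected docstring matches what callers actually receive, and the new
external_references() below relies on exactly this layout (node[1] is the key,
node[3] the reference lists). A sketch of a caller unpacking entries from a
builder with one reference list (constructor and add_node arguments follow the
signatures used elsewhere in this module; the values are illustrative):

    from bzrlib.btree_index import BTreeBuilder

    builder = BTreeBuilder(reference_lists=1)
    builder.add_node(('key2',), 'value2', ([],))
    builder.add_node(('key1',), 'value1', ([('key2',)],))
    for index, key, value, refs in builder.iter_all_entries():
        # value third, reference_lists fourth - as the docstring now says
        print(key, value, refs)
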
@@ -564,6 +586,8 @@
 class _LeafNode(object):
     """A leaf node for a serialised B+Tree index."""
 
+    __slots__ = ('keys',)
+
     def __init__(self, bytes, key_length, ref_list_length):
         """Parse bytes to create a leaf node object."""
         # splitlines mangles the \r delimiters.. don't use it.
@@ -574,6 +598,8 @@
 class _InternalNode(object):
     """An internal node for a serialised B+Tree index."""
 
+    __slots__ = ('keys', 'offset')
+
     def __init__(self, bytes):
         """Parse bytes to create an internal node object."""
         # splitlines mangles the \r delimiters.. don't use it.
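
Both node classes gain __slots__ for the same reason: a large index
materialises thousands of these objects, and slots remove the per-instance
__dict__. The effect is standard CPython behaviour, easy to check outside
bzrlib:

    class NoSlots(object):
        def __init__(self):
            self.keys = {}

    class WithSlots(object):
        __slots__ = ('keys',)
        def __init__(self):
            self.keys = {}

    print(hasattr(NoSlots(), '__dict__'))    # True: a dict per instance
    print(hasattr(WithSlots(), '__dict__'))  # False: fixed slot storage
    # The trade-off: WithSlots instances cannot grow arbitrary attributes;
    # WithSlots().extra = 1 raises AttributeError.
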
@@ -616,7 +642,11 @@
         # Default max size is 100,000 leaf values
         self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
         self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
-        self._internal_node_cache = lru_cache.LRUCache()
+        # We could limit this, but even a 300k record btree has only 3k leaf
+        # nodes, and only 20 internal nodes. So the default of 100 nodes in an
+        # LRU would mean we always cache everything anyway, no need to pay the
+        # overhead of LRU
+        self._internal_node_cache = fifo_cache.FIFOCache(100)
         self._key_count = None
         self._row_lengths = None
         self._row_offsets = None # Start of each row, [-1] is the end
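
The cache swap follows directly from the comment: when the working set (tens of
internal nodes) sits far below the capacity (100), nothing is ever evicted, so
LRU's reorder-on-every-hit bookkeeping is pure overhead. A minimal sketch of
the policy difference (plain OrderedDict, not bzrlib's fifo_cache or lru_cache
modules):

    from collections import OrderedDict

    class TinyFIFO(object):
        """Evicts the oldest *inserted* entry; cache hits cost nothing."""
        def __init__(self, capacity):
            self.capacity = capacity
            self.data = OrderedDict()
        def __getitem__(self, key):
            return self.data[key]  # no reordering on access
        def __setitem__(self, key, value):
            self.data[key] = value
            if len(self.data) > self.capacity:
                self.data.popitem(last=False)  # drop the oldest insertion

    class TinyLRU(TinyFIFO):
        """Evicts the least *recently used* entry; every hit reorders."""
        def __getitem__(self, key):
            value = self.data.pop(key)  # bookkeeping on every access
            self.data[key] = value
            return value

    # When capacity exceeds the number of distinct keys, both behave the
    # same except that FIFO skips the per-hit work - which is exactly the
    # reasoning in the comment above.
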
@@ -801,5 +831,18 @@
             new_tips = next_tips
         return final_offsets
 
+    def external_references(self, ref_list_num):
+        if self._root_node is None:
+            self._get_root_node()
+        if ref_list_num + 1 > self.node_ref_lists:
+            raise ValueError('No ref list %d, index has %d ref lists'
+                % (ref_list_num, self.node_ref_lists))
+        keys = set()
+        refs = set()
+        for node in self.iter_all_entries():
+            keys.add(node[1])
+            refs.update(node[3][ref_list_num])
+        return refs - keys
+
     def _find_layer_first_and_end(self, offset):
         """Find the start/stop nodes for the layer corresponding to offset.