~bzr-pqm/bzr/bzr.dev


Viewing changes to bzrlib/btree_index.py

  • Committer: Robert Collins
  • Date: 2008-09-19 06:53:41 UTC
  • mto: (3696.5.1 commit-updates)
  • mto: This revision was merged to the branch mainline in revision 3741.
  • Revision ID: robertc@robertcollins.net-20080919065341-5t5w1p2gi926nfia
First cut - make it work - at updating the tree stat cache during commit.

=== modified file 'bzrlib/btree_index.py'
@@ -37,7 +37,6 @@
     trace,
     )
 from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
-from bzrlib.osutils import basename, dirname
 from bzrlib.transport import get_transport
 
 
@@ -53,13 +52,6 @@
 # 4K per page: 4MB - 1000 entries
 _NODE_CACHE_SIZE = 1000
 
-leaf_value_hits = [0, 0]
-internal_node_hits = [0, 0]
-leaf_node_hits = [0, 0]
-miss_attempts = 0  # Missed this entry while looking up
-bisect_shortcut = [0, 0]
-dupes = [0]
-
 
 class _BuilderRow(object):
     """The stored state accumulated while writing out a row in the index.
@@ -85,8 +77,10 @@
             del byte_lines[-1]
             skipped_bytes = padding
         self.spool.writelines(byte_lines)
-        if (self.spool.tell() + skipped_bytes) % _PAGE_SIZE != 0:
-            raise AssertionError("incorrect node length")
+        remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
+        if remainder != 0:
+            raise AssertionError("incorrect node length: %d, %d"
+                                 % (self.spool.tell(), remainder))
         self.nodes += 1
         self.writer = None
 
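
The replacement assertion reports how far off the node ended up, not just that it was wrong. A minimal standalone sketch of the same page-alignment check (check_page_aligned is a hypothetical helper, not bzrlib API; only the 4K page size mirrors the real constant):

    _PAGE_SIZE = 4096  # matches the 4K pages mentioned above

    def check_page_aligned(byte_count, skipped_bytes=0):
        # A node must end exactly on a page boundary once padding is counted.
        remainder = (byte_count + skipped_bytes) % _PAGE_SIZE
        if remainder != 0:
            raise AssertionError("incorrect node length: %d, %d"
                                 % (byte_count, remainder))

    check_page_aligned(8192)        # two full pages: fine
    check_page_aligned(8000, 192)   # 192 bytes of skipped padding: fine
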
@@ -142,6 +136,10 @@
             key_elements=key_elements)
         self._spill_at = spill_at
         self._backing_indices = []
+        # A map of {key: (node_refs, value)}
+        self._nodes = {}
+        # Indicate it hasn't been built yet
+        self._nodes_by_key = None
 
     def add_node(self, key, value, references=()):
         """Add a node to the index.
@@ -157,10 +155,32 @@
         :param value: The value to associate with the key. It may be any
             bytes as long as it does not contain \0 or \n.
         """
-        index.GraphIndexBuilder.add_node(self, key, value, references=references)
+        # we don't care about absent_references
+        node_refs, _ = self._check_key_ref_value(key, references, value)
+        if key in self._nodes:
+            raise errors.BadIndexDuplicateKey(key, self)
+        self._nodes[key] = (node_refs, value)
+        self._keys.add(key)
+        if self._nodes_by_key is not None and self._key_length > 1:
+            self._update_nodes_by_key(key, value, node_refs)
         if len(self._keys) < self._spill_at:
             return
-        iterators_to_combine = [iter(sorted(self._iter_mem_nodes()))]
+        self._spill_mem_keys_to_disk()
+
+    def _spill_mem_keys_to_disk(self):
+        """Write the in memory keys down to disk to cap memory consumption.
+
+        If we already have some keys written to disk, we will combine them so
+        as to preserve the sorted order.  The algorithm for combining uses
+        powers of two.  So on the first spill, write all mem nodes into a
+        single index. On the second spill, combine the mem nodes with the nodes
+        on disk to create a 2x sized disk index and get rid of the first index.
+        On the third spill, create a single new disk index, which will contain
+        the mem nodes, and preserve the existing 2x sized index.  On the fourth,
+        combine mem with the first and second indexes, creating a new one of
+        size 4x. On the fifth create a single new one, etc.
+        """
+        iterators_to_combine = [self._iter_mem_nodes()]
         pos = -1
         for pos, backing in enumerate(self._backing_indices):
             if backing is None:
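
To make the power-of-two schedule in the new docstring concrete, here is a minimal standalone sketch (spill_schedule is illustrative only, not bzrlib code) that tracks which backing indices exist after each spill; sizes are in units of one in-memory batch:

    def spill_schedule(num_spills):
        """Simulate the combining rule: each spill merges the in-memory
        batch (size 1) with the leading run of existing indices, exactly
        like a binary increment carrying through low-order bits."""
        backing = []  # backing[i] holds an index of size 2**i, or None
        for _ in range(num_spills):
            size = 1  # the freshly spilled in-memory nodes
            for pos, existing in enumerate(backing):
                if existing is None:
                    break
                size += existing       # combine with this index
                backing[pos] = None    # and retire it
            else:
                pos = len(backing)
                backing.append(None)
            backing[pos] = size
        return backing

    for n in range(1, 6):
        print("%d spills -> %s" % (n, spill_schedule(n)))
    # 1 spills -> [1]
    # 2 spills -> [None, 2]
    # 3 spills -> [1, 2]
    # 4 spills -> [None, None, 4]
    # 5 spills -> [1, None, 4]

This is the binary-counter trick: a spill is an increment and combining is the carry, which keeps the number of live backing indices logarithmic in the total number of spills.
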
@@ -170,9 +190,11 @@
         backing_pos = pos + 1
         new_backing_file, size = \
             self._write_nodes(self._iter_smallest(iterators_to_combine))
-        new_backing = BTreeGraphIndex(
-            get_transport(dirname(new_backing_file.name)),
-            basename(new_backing_file.name), size)
+        dir_path, base_name = osutils.split(new_backing_file.name)
+        # Note: The transport here isn't strictly needed, because we will use
+        #       direct access to the new_backing._file object
+        new_backing = BTreeGraphIndex(get_transport(dir_path),
+                                      base_name, size)
         # GC will clean up the file
         new_backing._file = new_backing_file
         if len(self._backing_indices) == backing_pos:
@@ -182,7 +204,7 @@
             self._backing_indices[pos] = None
         self._keys = set()
         self._nodes = {}
-        self._nodes_by_key = {}
+        self._nodes_by_key = None
 
     def add_nodes(self, nodes):
         """Add nodes to the index.
@@ -198,14 +220,15 @@
 
     def _iter_mem_nodes(self):
         """Iterate over the nodes held in memory."""
+        nodes = self._nodes
         if self.reference_lists:
-            for key, (absent, references, value) in self._nodes.iteritems():
-                if not absent:
-                    yield self, key, value, references
+            for key in sorted(nodes):
+                references, value = nodes[key]
+                yield self, key, value, references
         else:
-            for key, (absent, references, value) in self._nodes.iteritems():
-                if not absent:
-                    yield self, key, value
+            for key in sorted(nodes):
+                references, value = nodes[key]
+                yield self, key, value
 
     def _iter_smallest(self, iterators_to_combine):
         if len(iterators_to_combine) == 1:
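
_iter_mem_nodes now yields keys in sorted order itself, which is why the iter(sorted(...)) wrappers disappear elsewhere in this diff. Merging several already-sorted streams, the job _iter_smallest does for the in-memory nodes plus the backing indices, can be sketched with the standard library (a stand-in, not the bzrlib implementation):

    import heapq

    mem_nodes = iter([('a',), ('c',)])    # already sorted, as above
    backing_1 = iter([('b',), ('d',)])    # each backing index is sorted too

    # heapq.merge lazily yields the globally smallest item across the
    # sorted inputs, which is what _iter_smallest needs to guarantee.
    merged = heapq.merge(mem_nodes, backing_1)
    print(list(merged))   # [('a',), ('b',), ('c',), ('d',)]
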
@@ -365,7 +388,10 @@
             copied_len = osutils.pumpfile(row.spool, result)
             if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                 if type(row) != _LeafBuilderRow:
-                    raise AssertionError("Not enough data copied")
+                    raise AssertionError("Incorrect amount of data copied"
+                        " expected: %d, got: %d"
+                        % ((row.nodes - 1) * _PAGE_SIZE,
+                           copied_len))
         result.flush()
         size = result.tell()
         result.seek(0)
@@ -391,7 +417,7 @@
                 "iter_all_entries scales with size of history.")
         # Doing serial rather than ordered would be faster; but this shouldn't
         # be getting called routinely anyway.
-        iterators = [iter(sorted(self._iter_mem_nodes()))]
+        iterators = [self._iter_mem_nodes()]
         for backing in self._backing_indices:
             if backing is not None:
                 iterators.append(backing.iter_all_entries())
@@ -411,13 +437,11 @@
         if self.reference_lists:
             for key in keys.intersection(self._keys):
                 node = self._nodes[key]
-                if not node[0]:
-                    yield self, key, node[2], node[1]
+                yield self, key, node[1], node[0]
         else:
             for key in keys.intersection(self._keys):
                 node = self._nodes[key]
-                if not node[0]:
-                    yield self, key, node[2]
+                yield self, key, node[1]
         keys.difference_update(self._keys)
         for backing in self._backing_indices:
             if backing is None:
@@ -466,12 +490,10 @@
                     node = self._nodes[key]
                 except KeyError:
                     continue
-                if node[0]:
-                    continue
                 if self.reference_lists:
-                    yield self, key, node[2], node[1]
+                    yield self, key, node[1], node[0]
                 else:
-                    yield self, key, node[2]
+                    yield self, key, node[1]
             return
         for key in keys:
             # sanity check
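
Both hunks above follow from the new _nodes layout set up in __init__: entries used to be (absent, references, value) three-tuples that had to be filtered on every read, and are now (node_refs, value) two-tuples where absent keys are simply never stored. A toy comparison (illustrative keys and values only):

    # Old layout: absent entries lived in the dict and had to be skipped.
    old_nodes = {('k1',): (False, ((),), 'value-1'),
                 ('k2',): (True, ((),), '')}          # absent entry
    present_old = [(k, v[2], v[1]) for k, v in old_nodes.items() if not v[0]]

    # New layout: only present keys are stored; no flag, no filtering.
    new_nodes = {('k1',): (((),), 'value-1')}
    present_new = [(k, v[1], v[0]) for k, v in new_nodes.items()]

    assert present_old == present_new == [(('k1',), 'value-1', ((),))]
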
@@ -480,7 +502,7 @@
             if len(key) != self._key_length:
                 raise errors.BadIndexKey(key)
             # find what it refers to:
-            key_dict = self._nodes_by_key
+            key_dict = self._get_nodes_by_key()
             elements = list(key)
             # find the subdict to return
             try:
@@ -506,6 +528,24 @@
             else:
                 yield (self, ) + key_dict
 
+    def _get_nodes_by_key(self):
+        if self._nodes_by_key is None:
+            nodes_by_key = {}
+            if self.reference_lists:
+                for key, (references, value) in self._nodes.iteritems():
+                    key_dict = nodes_by_key
+                    for subkey in key[:-1]:
+                        key_dict = key_dict.setdefault(subkey, {})
+                    key_dict[key[-1]] = key, value, references
+            else:
+                for key, (references, value) in self._nodes.iteritems():
+                    key_dict = nodes_by_key
+                    for subkey in key[:-1]:
+                        key_dict = key_dict.setdefault(subkey, {})
+                    key_dict[key[-1]] = key, value
+            self._nodes_by_key = nodes_by_key
+        return self._nodes_by_key
+
     def key_count(self):
         """Return an estimate of the number of keys in this index.
 
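
The lazily built _nodes_by_key structure nests one dict level per key element except the last, so prefix queries walk subdicts instead of scanning every key. A small illustration with made-up two-element keys:

    nodes_by_key = {}
    for key, value in {('dir1', 'file-a'): 'v1',
                       ('dir1', 'file-b'): 'v2',
                       ('dir2', 'file-c'): 'v3'}.items():
        key_dict = nodes_by_key
        for subkey in key[:-1]:
            key_dict = key_dict.setdefault(subkey, {})
        key_dict[key[-1]] = key, value

    # A prefix query for ('dir1', None) only has to walk one subdict:
    print(nodes_by_key['dir1'])
    # {'file-a': (('dir1', 'file-a'), 'v1'),
    #  'file-b': (('dir1', 'file-b'), 'v2')}

Building this lazily matches the __init__ change earlier in the diff: _nodes_by_key stays None until a prefix query needs it, and spilling resets it to None rather than rebuilding it eagerly.
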
@@ -622,7 +662,7 @@
             found[node_pos] = node
         return found
 
-    def _get_nodes(self, cache, node_indexes, counter):
+    def _get_nodes(self, cache, node_indexes):
         found = {}
         needed = []
         for idx in node_indexes:
@@ -631,10 +671,8 @@
                 continue
             try:
                 found[idx] = cache[idx]
-                counter[0] += 1
             except KeyError:
                 needed.append(idx)
-                counter[1] += 1
         found.update(self._cache_nodes(needed, cache))
         return found
 
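
With the hit/miss counters gone, _get_nodes is a plain read-through cache: satisfy what it can from the cache, collect the misses, and fetch them in one batch. A generic sketch of that pattern (get_many and fetch_many are hypothetical stand-ins, not bzrlib API):

    def get_many(cache, wanted, fetch_many):
        """Return {key: value} for wanted keys, reading misses in one batch."""
        found = {}
        needed = []
        for key in wanted:
            try:
                found[key] = cache[key]     # cache hit
            except KeyError:
                needed.append(key)          # collect misses for a bulk read
        fetched = fetch_many(needed)        # hypothetical bulk loader
        cache.update(fetched)
        found.update(fetched)
        return found

    cache = {1: 'page-1'}
    print(get_many(cache, [1, 2],
                   lambda keys: {k: 'page-%d' % k for k in keys}))
    # {1: 'page-1', 2: 'page-2'}
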
@@ -643,13 +681,11 @@
 
         After getting it, the node will be cached.
         """
-        return self._get_nodes(self._internal_node_cache, node_indexes,
-                               internal_node_hits)
+        return self._get_nodes(self._internal_node_cache, node_indexes)
 
     def _get_leaf_nodes(self, node_indexes):
         """Get a bunch of nodes, from cache or disk."""
-        found = self._get_nodes(self._leaf_node_cache, node_indexes,
-                                leaf_node_hits)
+        found = self._get_nodes(self._leaf_node_cache, node_indexes)
         if self._leaf_value_cache is not None:
             for node in found.itervalues():
                 for key, value in node.keys.iteritems():
@@ -715,17 +751,13 @@
         # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
         if len(in_keys) == 1: # Bisect will always be faster for M = 1
-            bisect_shortcut[0] += 1
             return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
         # elif bisect_steps < iter_steps:
-        #     bisect_shortcut[0] += len(in_keys)
         #     offsets = {}
         #     for key in in_keys:
         #         offsets.setdefault(bisect_right(fixed_keys, key),
         #                            []).append(key)
         #     return [(o, offsets[o]) for o in sorted(offsets)]
-        else:
-            bisect_shortcut[1] += len(in_keys)
         in_keys_iter = iter(in_keys)
         fixed_keys_iter = enumerate(fixed_keys)
         cur_in_key = in_keys_iter.next()
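
The commented-out alternative above partitions in_keys by which fixed_keys slot each falls after; the cost estimates in the comments (M + N for the linear merge versus M * log2(N) for repeated bisection) decide which strategy wins for a given batch. A runnable version of that bisect-based partitioning:

    from bisect import bisect_right

    fixed_keys = [('b',), ('d',), ('f',)]        # separator keys of child nodes
    in_keys = [('a',), ('c',), ('e',), ('g',)]   # sorted keys being looked up

    offsets = {}
    for key in in_keys:
        # bisect_right finds the child whose key range contains `key`,
        # in O(log N) per lookup instead of a linear scan.
        offsets.setdefault(bisect_right(fixed_keys, key), []).append(key)
    print([(o, offsets[o]) for o in sorted(offsets)])
    # [(0, [('a',)]), (1, [('c',)]), (2, [('e',)]), (3, [('g',)])]
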
@@ -794,7 +826,6 @@
         if not keys:
             return
 
-        global leaf_value_hits, miss_attempts, dupes
         if not self.key_count():
             return
 
@@ -805,15 +836,13 @@
             for key in keys:
                 value = self._leaf_value_cache.get(key, None)
                 if value is not None:
-                    leaf_value_hits[0] += 1
                     # This key is known not to be here, skip it
                     value, refs = value
                     if self.node_ref_lists:
                         yield (self, key, value, refs)
                     else:
                         yield (self, key, value)
                 else:
-                    leaf_value_hits[1] += 1
                     needed_keys.append(key)
 
         last_key = None
@@ -857,8 +886,6 @@
                         yield (self, next_sub_key, value, refs)
                     else:
                         yield (self, next_sub_key, value)
-                else:
-                    miss_attempts += 1
 
     def iter_entries_prefix(self, keys):
         """Iterate over keys within the index using prefix matching.