~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

  • Committer: Canonical.com Patch Queue Manager
  • Date: 2008-03-15 17:44:41 UTC
  • mfrom: (3224.1.29 annotate_finish)
  • Revision ID: pqm@pqm.ubuntu.com-20080315174441-l8xpw6femn0syal1
(jam) Tweak annotate performance for pack repositories,
        improving speed and decreasing memory consumption.

Show diffs side-by-side

added added

removed removed

Lines of Context:
74
74
lazy_import(globals(), """
75
75
from bzrlib import (
76
76
    annotate,
 
77
    graph as _mod_graph,
77
78
    lru_cache,
78
79
    pack,
79
80
    trace,
134
135
class KnitContent(object):
135
136
    """Content of a knit version to which deltas can be applied."""
136
137
 
 
138
    def __init__(self):
 
139
        self._should_strip_eol = False
 
140
 
137
141
    def annotate(self):
138
142
        """Return a list of (origin, text) tuples."""
139
143
        return list(self.annotate_iter())
142
146
        """Apply delta to this object to become new_version_id."""
143
147
        raise NotImplementedError(self.apply_delta)
144
148
 
 
149
    def cleanup_eol(self, copy_on_mutate=True):
 
150
        if self._should_strip_eol:
 
151
            if copy_on_mutate:
 
152
                self._lines = self._lines[:]
 
153
            self.strip_last_line_newline()
 
154
 
145
155
    def line_delta_iter(self, new_lines):
146
156
        """Generate line-based delta from this content to new_lines."""
147
157
        new_texts = new_lines.text()
187
197
    """Annotated content."""
188
198
 
189
199
    def __init__(self, lines):
 
200
        KnitContent.__init__(self)
190
201
        self._lines = lines
191
202
 
192
203
    def annotate_iter(self):
204
215
    def strip_last_line_newline(self):
205
216
        line = self._lines[-1][1].rstrip('\n')
206
217
        self._lines[-1] = (self._lines[-1][0], line)
 
218
        self._should_strip_eol = False
207
219
 
208
220
    def text(self):
209
221
        try:
210
 
            return [text for origin, text in self._lines]
 
222
            lines = [text for origin, text in self._lines]
211
223
        except ValueError, e:
212
224
            # most commonly (only?) caused by the internal form of the knit
213
225
            # missing annotation information because of a bug - see thread
216
228
                "line in annotated knit missing annotation information: %s"
217
229
                % (e,))
218
230
 
 
231
        if self._should_strip_eol:
 
232
            anno, line = lines[-1]
 
233
            lines[-1] = (anno, line.rstrip('\n'))
 
234
        return lines
 
235
 
219
236
    def copy(self):
220
237
        return AnnotatedKnitContent(self._lines[:])
221
238
 
229
246
    """
230
247
 
231
248
    def __init__(self, lines, version_id):
 
249
        KnitContent.__init__(self)
232
250
        self._lines = lines
233
251
        self._version_id = version_id
234
252
 
251
269
 
252
270
    def strip_last_line_newline(self):
253
271
        self._lines[-1] = self._lines[-1].rstrip('\n')
 
272
        self._should_strip_eol = False
254
273
 
255
274
    def text(self):
256
 
        return self._lines
257
 
 
258
 
 
259
 
class KnitAnnotateFactory(object):
 
275
        lines = self._lines
 
276
        if self._should_strip_eol:
 
277
            lines = lines[:]
 
278
            lines[-1] = lines[-1].rstrip('\n')
 
279
        return lines
 
280
 
 
281
 
 
282
class _KnitFactory(object):
 
283
    """Base class for common Factory functions."""
 
284
 
 
285
    def parse_record(self, version_id, record, record_details,
 
286
                     base_content, copy_base_content=True):
 
287
        """Parse a record into a full content object.
 
288
 
 
289
        :param version_id: The official version id for this content
 
290
        :param record: The data returned by read_records_iter()
 
291
        :param record_details: Details about the record returned by
 
292
            get_build_details
 
293
        :param base_content: If get_build_details returns a compression_parent,
 
294
            you must return a base_content here, else use None
 
295
        :param copy_base_content: When building from the base_content, decide
 
296
            you can either copy it and return a new object, or modify it in
 
297
            place.
 
298
        :return: (content, delta) A Content object and possibly a line-delta,
 
299
            delta may be None
 
300
        """
 
301
        method, noeol = record_details
 
302
        if method == 'line-delta':
 
303
            assert base_content is not None
 
304
            if copy_base_content:
 
305
                content = base_content.copy()
 
306
            else:
 
307
                content = base_content
 
308
            delta = self.parse_line_delta(record, version_id)
 
309
            content.apply_delta(delta, version_id)
 
310
        else:
 
311
            content = self.parse_fulltext(record, version_id)
 
312
            delta = None
 
313
        content._should_strip_eol = noeol
 
314
        return (content, delta)
 
315
 
 
316
 
 
317
class KnitAnnotateFactory(_KnitFactory):
260
318
    """Factory for creating annotated Content objects."""
261
319
 
262
320
    annotated = True
368
426
        return content.annotate_iter()
369
427
 
370
428
 
371
 
class KnitPlainFactory(object):
 
429
class KnitPlainFactory(_KnitFactory):
372
430
    """Factory for creating plain Content objects."""
373
431
 
374
432
    annotated = False
426
484
        return out
427
485
 
428
486
    def annotate_iter(self, knit, version_id):
429
 
        return annotate_knit(knit, version_id)
 
487
        annotator = _KnitAnnotator(knit)
 
488
        return iter(annotator.annotate(version_id))
430
489
 
431
490
 
432
491
def make_empty_knit(transport, relpath):
807
866
            factory = KnitAnnotateFactory()
808
867
        else:
809
868
            raise errors.KnitDataStreamUnknown(format)
810
 
        index = _StreamIndex(data_list)
 
869
        index = _StreamIndex(data_list, self._index)
811
870
        access = _StreamAccess(reader_callable, index, self, factory)
812
871
        return KnitVersionedFile(self.filename, self.transport,
813
872
            factory=factory, index=index, access_method=access)
874
933
 
875
934
        This data is intended to be used for retrieving the knit records.
876
935
 
877
 
        A dict of version_id to (method, index_memo, next) is
 
936
        A dict of version_id to (record_details, index_memo, next, parents) is
878
937
        returned.
879
938
        method is the way referenced data should be applied.
880
 
        data_pos is the position of the data in the knit.
881
 
        data_size is the size of the data in the knit.
 
939
        index_memo is the handle to pass to the data access to actually get the
 
940
            data
882
941
        next is the build-parent of the version, or None for fulltexts.
 
942
        parents is the version_ids of the parents of this version
883
943
        """
884
944
        component_data = {}
885
945
        pending_components = version_ids
886
946
        while pending_components:
887
947
            build_details = self._index.get_build_details(pending_components)
 
948
            current_components = set(pending_components)
888
949
            pending_components = set()
889
 
            for version_id, details in build_details.items():
890
 
                method, index_memo, compression_parent = details
 
950
            for version_id, details in build_details.iteritems():
 
951
                (index_memo, compression_parent, parents,
 
952
                 record_details) = details
 
953
                method = record_details[0]
891
954
                if compression_parent is not None:
892
955
                    pending_components.add(compression_parent)
893
 
                component_data[version_id] = details
 
956
                component_data[version_id] = (record_details, index_memo,
 
957
                                              compression_parent)
 
958
            missing = current_components.difference(build_details)
 
959
            if missing:
 
960
                raise errors.RevisionNotPresent(missing.pop(), self.filename)
894
961
        return component_data
895
962
       
896
963
    def _get_content(self, version_id, parent_texts={}):
1034
1101
    def _get_record_map(self, version_ids):
1035
1102
        """Produce a dictionary of knit records.
1036
1103
        
1037
 
        The keys are version_ids, the values are tuples of (method, content,
1038
 
        digest, next).
1039
 
        method is the way the content should be applied.  
1040
 
        content is a KnitContent object.
1041
 
        digest is the SHA1 digest of this version id after all steps are done
1042
 
        next is the build-parent of the version, i.e. the leftmost ancestor.
1043
 
        If the method is fulltext, next will be None.
 
1104
        :return: {version_id:(record, record_details, digest, next)}
 
1105
            record
 
1106
                data returned from read_records
 
1107
            record_details
 
1108
                opaque information to pass to parse_record
 
1109
            digest
 
1110
                SHA1 digest of the full text after all steps are done
 
1111
            next
 
1112
                build-parent of the version, i.e. the leftmost ancestor.
 
1113
                Will be None if the record is not a delta.
1044
1114
        """
1045
1115
        position_map = self._get_components_positions(version_ids)
1046
 
        # c = component_id, m = method, i_m = index_memo, n = next
1047
 
        records = [(c, i_m) for c, (m, i_m, n) in position_map.iteritems()]
 
1116
        # c = component_id, r = record_details, i_m = index_memo, n = next
 
1117
        records = [(c, i_m) for c, (r, i_m, n)
 
1118
                             in position_map.iteritems()]
1048
1119
        record_map = {}
1049
 
        for component_id, content, digest in \
 
1120
        for component_id, record, digest in \
1050
1121
                self._data.read_records_iter(records):
1051
 
            method, index_memo, next = position_map[component_id]
1052
 
            record_map[component_id] = method, content, digest, next
1053
 
                          
 
1122
            (record_details, index_memo, next) = position_map[component_id]
 
1123
            record_map[component_id] = record, record_details, digest, next
 
1124
 
1054
1125
        return record_map
1055
1126
 
1056
1127
    def get_text(self, version_id):
1091
1162
            components = []
1092
1163
            cursor = version_id
1093
1164
            while cursor is not None:
1094
 
                method, data, digest, next = record_map[cursor]
1095
 
                components.append((cursor, method, data, digest))
 
1165
                record, record_details, digest, next = record_map[cursor]
 
1166
                components.append((cursor, record, record_details, digest))
1096
1167
                if cursor in content_map:
1097
1168
                    break
1098
1169
                cursor = next
1099
1170
 
1100
1171
            content = None
1101
 
            for component_id, method, data, digest in reversed(components):
 
1172
            for (component_id, record, record_details,
 
1173
                 digest) in reversed(components):
1102
1174
                if component_id in content_map:
1103
1175
                    content = content_map[component_id]
1104
1176
                else:
1105
 
                    if method == 'fulltext':
1106
 
                        assert content is None
1107
 
                        content = self.factory.parse_fulltext(data, version_id)
1108
 
                    elif method == 'line-delta':
1109
 
                        delta = self.factory.parse_line_delta(data, version_id)
1110
 
                        if multiple_versions:
1111
 
                            # only doing this when we want multiple versions
1112
 
                            # output avoids list copies - which reference and
1113
 
                            # dereference many strings.
1114
 
                            content = content.copy()
1115
 
                        content.apply_delta(delta, version_id)
 
1177
                    content, delta = self.factory.parse_record(version_id,
 
1178
                        record, record_details, content,
 
1179
                        copy_base_content=multiple_versions)
1116
1180
                    if multiple_versions:
1117
1181
                        content_map[component_id] = content
1118
1182
 
1119
 
            if 'no-eol' in self._index.get_options(version_id):
1120
 
                if multiple_versions:
1121
 
                    content = content.copy()
1122
 
                content.strip_last_line_newline()
 
1183
            content.cleanup_eol(copy_on_mutate=multiple_versions)
1123
1184
            final_content[version_id] = content
1124
1185
 
1125
1186
            # digest here is the digest from the last applied component.
1426
1487
    def get_build_details(self, version_ids):
1427
1488
        """Get the method, index_memo and compression parent for version_ids.
1428
1489
 
 
1490
        Ghosts are omitted from the result.
 
1491
 
1429
1492
        :param version_ids: An iterable of version_ids.
1430
 
        :return: A dict of version_id:(method, index_memo, compression_parent).
 
1493
        :return: A dict of version_id:(index_memo, compression_parent,
 
1494
                                       parents, record_details).
 
1495
            index_memo
 
1496
                opaque structure to pass to read_records to extract the raw
 
1497
                data
 
1498
            compression_parent
 
1499
                Content that this record is built upon, may be None
 
1500
            parents
 
1501
                Logical parents of this node
 
1502
            record_details
 
1503
                extra information about the content which needs to be passed to
 
1504
                Factory.parse_record
1431
1505
        """
1432
1506
        result = {}
1433
1507
        for version_id in version_ids:
 
1508
            if version_id not in self._cache:
 
1509
                # ghosts are omitted
 
1510
                continue
1434
1511
            method = self.get_method(version_id)
 
1512
            parents = self.get_parents_with_ghosts(version_id)
1435
1513
            if method == 'fulltext':
1436
1514
                compression_parent = None
1437
1515
            else:
1438
 
                compression_parent = self.get_parents_with_ghosts(version_id)[0]
 
1516
                compression_parent = parents[0]
 
1517
            noeol = 'no-eol' in self.get_options(version_id)
1439
1518
            index_memo = self.get_position(version_id)
1440
 
            result[version_id] = (method, index_memo, compression_parent)
 
1519
            result[version_id] = (index_memo, compression_parent,
 
1520
                                  parents, (method, noeol))
1441
1521
        return result
1442
1522
 
1443
1523
    def iter_parents(self, version_ids):
1706
1786
    def get_build_details(self, version_ids):
1707
1787
        """Get the method, index_memo and compression parent for version_ids.
1708
1788
 
 
1789
        Ghosts are omitted from the result.
 
1790
 
1709
1791
        :param version_ids: An iterable of version_ids.
1710
 
        :return: A dict of version_id:(method, index_memo, compression_parent).
 
1792
        :return: A dict of version_id:(index_memo, compression_parent,
 
1793
                                       parents, record_details).
 
1794
            index_memo
 
1795
                opaque structure to pass to read_records to extract the raw
 
1796
                data
 
1797
            compression_parent
 
1798
                Content that this record is built upon, may be None
 
1799
            parents
 
1800
                Logical parents of this node
 
1801
            record_details
 
1802
                extra information about the content which needs to be passed to
 
1803
                Factory.parse_record
1711
1804
        """
1712
1805
        result = {}
1713
1806
        entries = self._get_entries(self._version_ids_to_keys(version_ids), True)
1714
1807
        for entry in entries:
1715
1808
            version_id = self._keys_to_version_ids((entry[1],))[0]
 
1809
            if not self._parents:
 
1810
                parents = ()
 
1811
            else:
 
1812
                parents = self._keys_to_version_ids(entry[3][0])
1716
1813
            if not self._deltas:
1717
1814
                compression_parent = None
1718
1815
            else:
1722
1819
                    (compression_parent_key,))[0]
1723
1820
                else:
1724
1821
                    compression_parent = None
 
1822
            noeol = (entry[2][0] == 'N')
1725
1823
            if compression_parent:
1726
1824
                method = 'line-delta'
1727
1825
            else:
1728
1826
                method = 'fulltext'
1729
 
            result[version_id] = (method, self._node_to_position(entry),
1730
 
                compression_parent)
 
1827
            result[version_id] = (self._node_to_position(entry),
 
1828
                                  compression_parent, parents,
 
1829
                                  (method, noeol))
1731
1830
        return result
1732
1831
 
1733
1832
    def _compression_parent(self, an_entry):
2174
2273
class _StreamIndex(object):
2175
2274
    """A Knit Index object that uses the data map from a datastream."""
2176
2275
 
2177
 
    def __init__(self, data_list):
 
2276
    def __init__(self, data_list, backing_index):
2178
2277
        """Create a _StreamIndex object.
2179
2278
 
2180
2279
        :param data_list: The data_list from the datastream.
 
2280
        :param backing_index: The index which will supply values for nodes
 
2281
            referenced outside of this stream.
2181
2282
        """
2182
2283
        self.data_list = data_list
 
2284
        self.backing_index = backing_index
2183
2285
        self._by_version = {}
2184
2286
        pos = 0
2185
2287
        for key, options, length, parents in data_list:
2212
2314
    def get_build_details(self, version_ids):
2213
2315
        """Get the method, index_memo and compression parent for version_ids.
2214
2316
 
 
2317
        Ghosts are omitted from the result.
 
2318
 
2215
2319
        :param version_ids: An iterable of version_ids.
2216
 
        :return: A dict of version_id:(method, index_memo, compression_parent).
 
2320
        :return: A dict of version_id:(index_memo, compression_parent,
 
2321
                                       parents, record_details).
 
2322
            index_memo
 
2323
                opaque structure to pass to read_records to extract the raw
 
2324
                data
 
2325
            compression_parent
 
2326
                Content that this record is built upon, may be None
 
2327
            parents
 
2328
                Logical parents of this node
 
2329
            record_details
 
2330
                extra information about the content which needs to be passed to
 
2331
                Factory.parse_record
2217
2332
        """
2218
2333
        result = {}
2219
2334
        for version_id in version_ids:
2220
 
            method = self.get_method(version_id)
 
2335
            try:
 
2336
                method = self.get_method(version_id)
 
2337
            except errors.RevisionNotPresent:
 
2338
                # ghosts are omitted
 
2339
                continue
 
2340
            parent_ids = self.get_parents_with_ghosts(version_id)
 
2341
            noeol = ('no-eol' in self.get_options(version_id))
2221
2342
            if method == 'fulltext':
2222
2343
                compression_parent = None
2223
2344
            else:
2224
 
                compression_parent = self.get_parents_with_ghosts(version_id)[0]
 
2345
                compression_parent = parent_ids[0]
2225
2346
            index_memo = self.get_position(version_id)
2226
 
            result[version_id] = (method, index_memo, compression_parent)
 
2347
            result[version_id] = (index_memo, compression_parent,
 
2348
                                  parent_ids, (method, noeol))
2227
2349
        return result
2228
2350
 
2229
2351
    def get_method(self, version_id):
2233
2355
        except KeyError:
2234
2356
            # Strictly speaking this should check in the backing knit, but
2235
2357
            # until we have a test to discriminate, this will do.
2236
 
            return 'fulltext'
 
2358
            return self.backing_index.get_method(version_id)
2237
2359
        if 'fulltext' in options:
2238
2360
            return 'fulltext'
2239
2361
        elif 'line-delta' in options:
2246
2368
 
2247
2369
        e.g. ['foo', 'bar']
2248
2370
        """
2249
 
        return self._by_version[version_id][0]
 
2371
        try:
 
2372
            return self._by_version[version_id][0]
 
2373
        except KeyError:
 
2374
            return self.backing_index.get_options(version_id)
2250
2375
 
2251
2376
    def get_parents_with_ghosts(self, version_id):
2252
2377
        """Return parents of specified version with ghosts."""
2253
 
        return self._by_version[version_id][2]
 
2378
        try:
 
2379
            return self._by_version[version_id][2]
 
2380
        except KeyError:
 
2381
            return self.backing_index.get_parents_with_ghosts(version_id)
2254
2382
 
2255
2383
    def get_position(self, version_id):
2256
2384
        """Return details needed to access the version.
2735
2863
    It will work for knits with cached annotations, but this is not
2736
2864
    recommended.
2737
2865
    """
2738
 
    ancestry = knit.get_ancestry(revision_id)
2739
 
    fulltext = dict(zip(ancestry, knit.get_line_list(ancestry)))
2740
 
    annotations = {}
2741
 
    for candidate in ancestry:
2742
 
        if candidate in annotations:
2743
 
            continue
2744
 
        parents = knit.get_parents(candidate)
2745
 
        if len(parents) == 0:
2746
 
            blocks = None
2747
 
        elif knit._index.get_method(candidate) != 'line-delta':
2748
 
            blocks = None
 
2866
    annotator = _KnitAnnotator(knit)
 
2867
    return iter(annotator.annotate(revision_id))
 
2868
 
 
2869
 
 
2870
class _KnitAnnotator(object):
 
2871
    """Build up the annotations for a text."""
 
2872
 
 
2873
    def __init__(self, knit):
 
2874
        self._knit = knit
 
2875
 
 
2876
        # Content objects, differs from fulltexts because of how final newlines
 
2877
        # are treated by knits. the content objects here will always have a
 
2878
        # final newline
 
2879
        self._fulltext_contents = {}
 
2880
 
 
2881
        # Annotated lines of specific revisions
 
2882
        self._annotated_lines = {}
 
2883
 
 
2884
        # Track the raw data for nodes that we could not process yet.
 
2885
        # This maps the revision_id of the base to a list of children that will
 
2886
        # annotated from it.
 
2887
        self._pending_children = {}
 
2888
 
 
2889
        # Nodes which cannot be extracted
 
2890
        self._ghosts = set()
 
2891
 
 
2892
        # Track how many children this node has, so we know if we need to keep
 
2893
        # it
 
2894
        self._annotate_children = {}
 
2895
        self._compression_children = {}
 
2896
 
 
2897
        self._all_build_details = {}
 
2898
        # The children => parent revision_id graph
 
2899
        self._revision_id_graph = {}
 
2900
 
 
2901
        self._heads_provider = None
 
2902
 
 
2903
        self._nodes_to_keep_annotations = set()
 
2904
        self._generations_until_keep = 100
 
2905
 
 
2906
    def set_generations_until_keep(self, value):
 
2907
        """Set the number of generations before caching a node.
 
2908
 
 
2909
        Setting this to -1 will cache every merge node, setting this higher
 
2910
        will cache fewer nodes.
 
2911
        """
 
2912
        self._generations_until_keep = value
 
2913
 
 
2914
    def _add_fulltext_content(self, revision_id, content_obj):
 
2915
        self._fulltext_contents[revision_id] = content_obj
 
2916
        # TODO: jam 20080305 It might be good to check the sha1digest here
 
2917
        return content_obj.text()
 
2918
 
 
2919
    def _check_parents(self, child, nodes_to_annotate):
 
2920
        """Check if all parents have been processed.
 
2921
 
 
2922
        :param child: A tuple of (rev_id, parents, raw_content)
 
2923
        :param nodes_to_annotate: If child is ready, add it to
 
2924
            nodes_to_annotate, otherwise put it back in self._pending_children
 
2925
        """
 
2926
        for parent_id in child[1]:
 
2927
            if (parent_id not in self._annotated_lines):
 
2928
                # This parent is present, but another parent is missing
 
2929
                self._pending_children.setdefault(parent_id,
 
2930
                                                  []).append(child)
 
2931
                break
2749
2932
        else:
2750
 
            parent, sha1, noeol, delta = knit.get_delta(candidate)
2751
 
            blocks = KnitContent.get_line_delta_blocks(delta,
2752
 
                fulltext[parents[0]], fulltext[candidate])
2753
 
        annotations[candidate] = list(annotate.reannotate([annotations[p]
2754
 
            for p in parents], fulltext[candidate], candidate, blocks))
2755
 
    return iter(annotations[revision_id])
 
2933
            # This one is ready to be processed
 
2934
            nodes_to_annotate.append(child)
 
2935
 
 
2936
    def _add_annotation(self, revision_id, fulltext, parent_ids,
 
2937
                        left_matching_blocks=None):
 
2938
        """Add an annotation entry.
 
2939
 
 
2940
        All parents should already have been annotated.
 
2941
        :return: A list of children that now have their parents satisfied.
 
2942
        """
 
2943
        a = self._annotated_lines
 
2944
        annotated_parent_lines = [a[p] for p in parent_ids]
 
2945
        annotated_lines = list(annotate.reannotate(annotated_parent_lines,
 
2946
            fulltext, revision_id, left_matching_blocks,
 
2947
            heads_provider=self._get_heads_provider()))
 
2948
        self._annotated_lines[revision_id] = annotated_lines
 
2949
        for p in parent_ids:
 
2950
            ann_children = self._annotate_children[p]
 
2951
            ann_children.remove(revision_id)
 
2952
            if (not ann_children
 
2953
                and p not in self._nodes_to_keep_annotations):
 
2954
                del self._annotated_lines[p]
 
2955
                del self._all_build_details[p]
 
2956
                if p in self._fulltext_contents:
 
2957
                    del self._fulltext_contents[p]
 
2958
        # Now that we've added this one, see if there are any pending
 
2959
        # deltas to be done, certainly this parent is finished
 
2960
        nodes_to_annotate = []
 
2961
        for child in self._pending_children.pop(revision_id, []):
 
2962
            self._check_parents(child, nodes_to_annotate)
 
2963
        return nodes_to_annotate
 
2964
 
 
2965
    def _get_build_graph(self, revision_id):
 
2966
        """Get the graphs for building texts and annotations.
 
2967
 
 
2968
        The data you need for creating a full text may be different than the
 
2969
        data you need to annotate that text. (At a minimum, you need both
 
2970
        parents to create an annotation, but only need 1 parent to generate the
 
2971
        fulltext.)
 
2972
 
 
2973
        :return: A list of (revision_id, index_memo) records, suitable for
 
2974
            passing to read_records_iter to start reading in the raw data fro/
 
2975
            the pack file.
 
2976
        """
 
2977
        if revision_id in self._annotated_lines:
 
2978
            # Nothing to do
 
2979
            return []
 
2980
        pending = set([revision_id])
 
2981
        records = []
 
2982
        generation = 0
 
2983
        kept_generation = 0
 
2984
        while pending:
 
2985
            # get all pending nodes
 
2986
            generation += 1
 
2987
            this_iteration = pending
 
2988
            build_details = self._knit._index.get_build_details(this_iteration)
 
2989
            self._all_build_details.update(build_details)
 
2990
            # new_nodes = self._knit._index._get_entries(this_iteration)
 
2991
            pending = set()
 
2992
            for rev_id, details in build_details.iteritems():
 
2993
                (index_memo, compression_parent, parents,
 
2994
                 record_details) = details
 
2995
                self._revision_id_graph[rev_id] = parents
 
2996
                records.append((rev_id, index_memo))
 
2997
                # Do we actually need to check _annotated_lines?
 
2998
                pending.update(p for p in parents
 
2999
                                 if p not in self._all_build_details)
 
3000
                if compression_parent:
 
3001
                    self._compression_children.setdefault(compression_parent,
 
3002
                        []).append(rev_id)
 
3003
                if parents:
 
3004
                    for parent in parents:
 
3005
                        self._annotate_children.setdefault(parent,
 
3006
                            []).append(rev_id)
 
3007
                    num_gens = generation - kept_generation
 
3008
                    if ((num_gens >= self._generations_until_keep)
 
3009
                        and len(parents) > 1):
 
3010
                        kept_generation = generation
 
3011
                        self._nodes_to_keep_annotations.add(rev_id)
 
3012
 
 
3013
            missing_versions = this_iteration.difference(build_details.keys())
 
3014
            self._ghosts.update(missing_versions)
 
3015
            for missing_version in missing_versions:
 
3016
                # add a key, no parents
 
3017
                self._revision_id_graph[missing_version] = ()
 
3018
                pending.discard(missing_version) # don't look for it
 
3019
        # XXX: This should probably be a real exception, as it is a data
 
3020
        #      inconsistency
 
3021
        assert not self._ghosts.intersection(self._compression_children), \
 
3022
            "We cannot have nodes which have a compression parent of a ghost."
 
3023
        # Cleanout anything that depends on a ghost so that we don't wait for
 
3024
        # the ghost to show up
 
3025
        for node in self._ghosts:
 
3026
            if node in self._annotate_children:
 
3027
                # We won't be building this node
 
3028
                del self._annotate_children[node]
 
3029
        # Generally we will want to read the records in reverse order, because
 
3030
        # we find the parent nodes after the children
 
3031
        records.reverse()
 
3032
        return records
 
3033
 
 
3034
    def _annotate_records(self, records):
 
3035
        """Build the annotations for the listed records."""
 
3036
        # We iterate in the order read, rather than a strict order requested
 
3037
        # However, process what we can, and put off to the side things that
 
3038
        # still need parents, cleaning them up when those parents are
 
3039
        # processed.
 
3040
        for (rev_id, record,
 
3041
             digest) in self._knit._data.read_records_iter(records):
 
3042
            if rev_id in self._annotated_lines:
 
3043
                continue
 
3044
            parent_ids = self._revision_id_graph[rev_id]
 
3045
            parent_ids = [p for p in parent_ids if p not in self._ghosts]
 
3046
            details = self._all_build_details[rev_id]
 
3047
            (index_memo, compression_parent, parents,
 
3048
             record_details) = details
 
3049
            nodes_to_annotate = []
 
3050
            # TODO: Remove the punning between compression parents, and
 
3051
            #       parent_ids, we should be able to do this without assuming
 
3052
            #       the build order
 
3053
            if len(parent_ids) == 0:
 
3054
                # There are no parents for this node, so just add it
 
3055
                # TODO: This probably needs to be decoupled
 
3056
                assert compression_parent is None
 
3057
                fulltext_content, delta = self._knit.factory.parse_record(
 
3058
                    rev_id, record, record_details, None)
 
3059
                fulltext = self._add_fulltext_content(rev_id, fulltext_content)
 
3060
                nodes_to_annotate.extend(self._add_annotation(rev_id, fulltext,
 
3061
                    parent_ids, left_matching_blocks=None))
 
3062
            else:
 
3063
                child = (rev_id, parent_ids, record)
 
3064
                # Check if all the parents are present
 
3065
                self._check_parents(child, nodes_to_annotate)
 
3066
            while nodes_to_annotate:
 
3067
                # Should we use a queue here instead of a stack?
 
3068
                (rev_id, parent_ids, record) = nodes_to_annotate.pop()
 
3069
                (index_memo, compression_parent, parents,
 
3070
                 record_details) = self._all_build_details[rev_id]
 
3071
                if compression_parent is not None:
 
3072
                    comp_children = self._compression_children[compression_parent]
 
3073
                    assert rev_id in comp_children
 
3074
                    # If there is only 1 child, it is safe to reuse this
 
3075
                    # content
 
3076
                    reuse_content = (len(comp_children) == 1
 
3077
                        and compression_parent not in
 
3078
                            self._nodes_to_keep_annotations)
 
3079
                    if reuse_content:
 
3080
                        # Remove it from the cache since it will be changing
 
3081
                        parent_fulltext_content = self._fulltext_contents.pop(compression_parent)
 
3082
                        # Make sure to copy the fulltext since it might be
 
3083
                        # modified
 
3084
                        parent_fulltext = list(parent_fulltext_content.text())
 
3085
                    else:
 
3086
                        parent_fulltext_content = self._fulltext_contents[compression_parent]
 
3087
                        parent_fulltext = parent_fulltext_content.text()
 
3088
                    comp_children.remove(rev_id)
 
3089
                    fulltext_content, delta = self._knit.factory.parse_record(
 
3090
                        rev_id, record, record_details,
 
3091
                        parent_fulltext_content,
 
3092
                        copy_base_content=(not reuse_content))
 
3093
                    fulltext = self._add_fulltext_content(rev_id,
 
3094
                                                          fulltext_content)
 
3095
                    blocks = KnitContent.get_line_delta_blocks(delta,
 
3096
                            parent_fulltext, fulltext)
 
3097
                else:
 
3098
                    fulltext_content = self._knit.factory.parse_fulltext(
 
3099
                        record, rev_id)
 
3100
                    fulltext = self._add_fulltext_content(rev_id,
 
3101
                        fulltext_content)
 
3102
                    blocks = None
 
3103
                nodes_to_annotate.extend(
 
3104
                    self._add_annotation(rev_id, fulltext, parent_ids,
 
3105
                                     left_matching_blocks=blocks))
 
3106
 
 
3107
    def _get_heads_provider(self):
 
3108
        """Create a heads provider for resolving ancestry issues."""
 
3109
        if self._heads_provider is not None:
 
3110
            return self._heads_provider
 
3111
        parent_provider = _mod_graph.DictParentsProvider(
 
3112
            self._revision_id_graph)
 
3113
        graph_obj = _mod_graph.Graph(parent_provider)
 
3114
        head_cache = _mod_graph.FrozenHeadsCache(graph_obj)
 
3115
        self._heads_provider = head_cache
 
3116
        return head_cache
 
3117
 
 
3118
    def annotate(self, revision_id):
 
3119
        """Return the annotated fulltext at the given revision.
 
3120
 
 
3121
        :param revision_id: The revision id for this file
 
3122
        """
 
3123
        records = self._get_build_graph(revision_id)
 
3124
        if revision_id in self._ghosts:
 
3125
            raise errors.RevisionNotPresent(revision_id, self._knit)
 
3126
        self._annotate_records(records)
 
3127
        return self._annotated_lines[revision_id]
2756
3128
 
2757
3129
 
2758
3130
try: