1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Functionality for doing annotations in the 'optimal' way"""
19
from bzrlib import errors, graph as _mod_graph, osutils, patiencediff, ui
22
cdef class _NeededTextIterator:
25
cdef object text_cache
27
cdef object stream_len
30
def __init__(self, stream, text_cache, stream_len, pb=None):
33
self.stream_len = stream_len
34
self.text_cache = text_cache
35
self.stream_len = stream_len
42
record = self.stream.next()
43
if self.pb is not None:
44
self.pb.update('extracting', self.counter, self.stream_len)
45
self.counter = self.counter + 1
46
lines = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
47
num_lines = len(lines)
48
self.text_cache[record.key] = lines
49
return record.key, lines, num_lines
53
"""Class that drives performing annotations."""
55
def __init__(self, vf):
56
"""Create a new Annotator from a VersionedFile."""
60
# Map from key => number of nexts that will be built from this key
61
self._num_needed_children = {}
62
self._annotations_cache = {}
63
self._heads_provider = None
65
def _get_needed_keys(self, key):
66
graph = _mod_graph.Graph(self._vf)
68
# We need 1 extra copy of the node we will be looking at when we are
70
self._num_needed_children[key] = 1
71
for key, parent_keys in graph.iter_ancestry([key]):
72
if parent_keys is None:
74
parent_map[key] = parent_keys
75
for parent_key in parent_keys:
76
if parent_key in self._num_needed_children:
77
self._num_needed_children[parent_key] += 1
79
self._num_needed_children[parent_key] = 1
80
self._parent_map.update(parent_map)
81
# _heads_provider does some graph caching, so it is only valid while
82
# self._parent_map hasn't changed
83
self._heads_provider = None
84
keys = parent_map.keys()
87
def _get_needed_texts(self, key, pb=None):
88
"""Get the texts we need to properly annotate key.
90
:param key: A Key that is present in self._vf
91
:return: Yield (this_key, text, num_lines)
92
'text' is an opaque object that just has to work with whatever
93
matcher object we are using. Currently it is always 'lines' but
94
future improvements may change this to a simple text string.
96
keys = self._get_needed_keys(key)
98
pb.update('getting stream', 0, len(keys))
99
stream = self._vf.get_record_stream(keys, 'topological', True)
100
iterator = _NeededTextIterator(stream, self._text_cache, len(keys), pb)
103
def _get_parent_annotations_and_matches(self, key, text, parent_key):
104
"""Get the list of annotations for the parent, and the matching lines.
106
:param text: The opaque value given by _get_needed_texts
107
:param parent_key: The key for the parent text
108
:return: (parent_annotations, matching_blocks)
109
parent_annotations is a list as long as the number of lines in
111
matching_blocks is a list of (parent_idx, text_idx, len) tuples
112
indicating which lines match between the two texts
114
parent_lines = self._text_cache[parent_key]
115
parent_annotations = self._annotations_cache[parent_key]
116
# PatienceSequenceMatcher should probably be part of Policy
117
matcher = patiencediff.PatienceSequenceMatcher(None,
119
matching_blocks = matcher.get_matching_blocks()
120
return parent_annotations, matching_blocks
122
def _update_from_one_parent(self, key, annotations, lines, parent_key):
123
"""Reannotate this text relative to its first parent."""
124
parent_annotations, matching_blocks = self._get_parent_annotations_and_matches(
125
key, lines, parent_key)
127
for parent_idx, lines_idx, match_len in matching_blocks:
128
# For all matching regions we copy across the parent annotations
129
annotations[lines_idx:lines_idx + match_len] = \
130
parent_annotations[parent_idx:parent_idx + match_len]
132
def _update_from_other_parents(self, key, annotations, lines,
133
this_annotation, parent_key):
134
"""Reannotate this text relative to a second (or more) parent."""
135
parent_annotations, matching_blocks = self._get_parent_annotations_and_matches(
136
key, lines, parent_key)
141
# TODO: consider making all annotations unique and then using 'is'
142
# everywhere. Current results claim that isn't any faster,
143
# because of the time spent deduping
144
# deduping also saves a bit of memory. For NEWS it saves ~1MB,
145
# but that is out of 200-300MB for extracting everything, so a
146
# fairly trivial amount
147
for parent_idx, lines_idx, match_len in matching_blocks:
148
# For lines which match this parent, we will now resolve whether
149
# this parent wins over the current annotation
150
ann_sub = annotations[lines_idx:lines_idx + match_len]
151
par_sub = parent_annotations[parent_idx:parent_idx + match_len]
152
if ann_sub == par_sub:
154
for idx in xrange(match_len):
156
par_ann = par_sub[idx]
157
ann_idx = lines_idx + idx
161
if ann == this_annotation:
162
# Originally claimed 'this', but it was really in this
164
annotations[ann_idx] = par_ann
166
# Resolve the fact that both sides have a different value for
168
if ann == last_ann and par_ann == last_parent:
169
annotations[ann_idx] = last_res
172
new_ann.update(par_ann)
173
new_ann = tuple(sorted(new_ann))
174
annotations[ann_idx] = new_ann
176
last_parent = par_ann
179
def _record_annotation(self, key, parent_keys, annotations):
180
self._annotations_cache[key] = annotations
181
for parent_key in parent_keys:
182
num = self._num_needed_children[parent_key]
185
del self._text_cache[parent_key]
186
del self._annotations_cache[parent_key]
187
# Do we want to clean up _num_needed_children at this point as
189
self._num_needed_children[parent_key] = num
191
def _annotate_one(self, key, text, num_lines):
192
this_annotation = (key,)
193
# Note: annotations will be mutated by calls to _update_from*
194
annotations = [this_annotation] * num_lines
195
parent_keys = self._parent_map[key]
197
self._update_from_one_parent(key, annotations, text, parent_keys[0])
198
for parent in parent_keys[1:]:
199
self._update_from_other_parents(key, annotations, text,
200
this_annotation, parent)
201
self._record_annotation(key, parent_keys, annotations)
203
def annotate(self, key):
204
"""Return annotated fulltext for the given key."""
205
keys = self._get_needed_texts(key)
206
pb = ui.ui_factory.nested_progress_bar()
208
for text_key, text, num_lines in self._get_needed_texts(key, pb=pb):
209
self._annotate_one(text_key, text, num_lines)
213
annotations = self._annotations_cache[key]
215
raise errors.RevisionNotPresent(key, self._vf)
216
return annotations, self._text_cache[key]
218
def _get_heads_provider(self):
219
if self._heads_provider is None:
220
self._heads_provider = _mod_graph.KnownGraph(self._parent_map)
221
return self._heads_provider
223
def annotate_flat(self, key):
224
"""Determine the single-best-revision to source for each line.
226
This is meant as a compatibility thunk to how annotate() used to work.
228
annotations, lines = self.annotate(key)
229
assert len(annotations) == len(lines)
231
heads = self._get_heads_provider().heads
233
for annotation, line in zip(annotations, lines):
234
if len(annotation) == 1:
235
append((annotation[0], line))
237
the_heads = heads(annotation)
238
if len(the_heads) == 1:
239
for head in the_heads:
242
# We need to resolve the ambiguity, for now just pick the
244
head = sorted(the_heads)[0]