~bzr-pqm/bzr/bzr.dev

Viewing changes to bzrlib/_annotator_pyx.pyx

Committer: John Arbash Meinel
Date: 2009-06-23 20:20:39 UTC
mto: This revision was merged to the branch mainline in revision 4522.
Revision ID: john@arbash-meinel.com-20090623202039-kr6mxu576z5vc4y5

Initial implementation of a Pyrex annotator.

Drops the time down to 8.6s down from 9.7s.
This implementation is basically just a copy of the python one, so I'm
a bit surprised to see that much of a win already.

files added:
bzrlib/_annotator_pyx.pyx

files modified:
.bzrignore

bzrlib/annotate.py

bzrlib/knit.py

setup.py

Show diffs side-by-side

added added

removed removed

bzrlib/_annotator_pyx.pyx

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Functionality for doing annotations in the 'optimal' way"""

from bzrlib import errors, graph as _mod_graph, osutils, patiencediff, ui

cdef class _NeededTextIterator:

cdef object counter

cdef object text_cache

cdef object stream

cdef object stream_len

cdef object pb

def __init__(self, stream, text_cache, stream_len, pb=None):

self.counter = 0

self.stream = stream

self.stream_len = stream_len

self.text_cache = text_cache

self.stream_len = stream_len

self.pb = pb

def __iter__(self):

return self

def __next__(self):

record = self.stream.next()

if self.pb is not None:

self.pb.update('extracting', self.counter, self.stream_len)

self.counter = self.counter + 1

lines = osutils.chunks_to_lines(record.get_bytes_as('chunked'))

num_lines = len(lines)

self.text_cache[record.key] = lines

return record.key, lines, num_lines

class Annotator:

"""Class that drives performing annotations."""

def __init__(self, vf):

"""Create a new Annotator from a VersionedFile."""

self._vf = vf

self._parent_map = {}

self._text_cache = {}

# Map from key => number of nexts that will be built from this key

self._num_needed_children = {}

self._annotations_cache = {}

self._heads_provider = None

def _get_needed_keys(self, key):

graph = _mod_graph.Graph(self._vf)

parent_map = {}

# We need 1 extra copy of the node we will be looking at when we are

# done

self._num_needed_children[key] = 1

for key, parent_keys in graph.iter_ancestry([key]):

if parent_keys is None:

continue

parent_map[key] = parent_keys

for parent_key in parent_keys:

if parent_key in self._num_needed_children:

self._num_needed_children[parent_key] += 1

else:

self._num_needed_children[parent_key] = 1

self._parent_map.update(parent_map)

# _heads_provider does some graph caching, so it is only valid while

# self._parent_map hasn't changed

self._heads_provider = None

keys = parent_map.keys()

return keys

def _get_needed_texts(self, key, pb=None):

"""Get the texts we need to properly annotate key.

:param key: A Key that is present in self._vf

:return: Yield (this_key, text, num_lines)

'text' is an opaque object that just has to work with whatever

matcher object we are using. Currently it is always 'lines' but

future improvements may change this to a simple text string.

"""

keys = self._get_needed_keys(key)

if pb is not None:

pb.update('getting stream', 0, len(keys))

stream = self._vf.get_record_stream(keys, 'topological', True)

100

iterator = _NeededTextIterator(stream, self._text_cache, len(keys), pb)

101

return iterator

102

103

def _get_parent_annotations_and_matches(self, key, text, parent_key):

104

"""Get the list of annotations for the parent, and the matching lines.

105

106

:param text: The opaque value given by _get_needed_texts

107

:param parent_key: The key for the parent text

108

:return: (parent_annotations, matching_blocks)

109

parent_annotations is a list as long as the number of lines in

110

parent

111

matching_blocks is a list of (parent_idx, text_idx, len) tuples

112

indicating which lines match between the two texts

113

"""

114

parent_lines = self._text_cache[parent_key]

115

parent_annotations = self._annotations_cache[parent_key]

116

# PatienceSequenceMatcher should probably be part of Policy

117

matcher = patiencediff.PatienceSequenceMatcher(None,

118

parent_lines, text)

119

matching_blocks = matcher.get_matching_blocks()

120

return parent_annotations, matching_blocks

121

122

def _update_from_one_parent(self, key, annotations, lines, parent_key):

123

"""Reannotate this text relative to its first parent."""

124

parent_annotations, matching_blocks = self._get_parent_annotations_and_matches(

125

key, lines, parent_key)

126

127

for parent_idx, lines_idx, match_len in matching_blocks:

128

# For all matching regions we copy across the parent annotations

129

annotations[lines_idx:lines_idx + match_len] = \

130

parent_annotations[parent_idx:parent_idx + match_len]

131

132

def _update_from_other_parents(self, key, annotations, lines,

133

this_annotation, parent_key):

134

"""Reannotate this text relative to a second (or more) parent."""

135

parent_annotations, matching_blocks = self._get_parent_annotations_and_matches(

136

key, lines, parent_key)

137

138

last_ann = None

139

last_parent = None

140

last_res = None

141

# TODO: consider making all annotations unique and then using 'is'

142

# everywhere. Current results claim that isn't any faster,

143

# because of the time spent deduping

144

# deduping also saves a bit of memory. For NEWS it saves ~1MB,

145

# but that is out of 200-300MB for extracting everything, so a

146

# fairly trivial amount

147

for parent_idx, lines_idx, match_len in matching_blocks:

148

# For lines which match this parent, we will now resolve whether

149

# this parent wins over the current annotation

150

ann_sub = annotations[lines_idx:lines_idx + match_len]

151

par_sub = parent_annotations[parent_idx:parent_idx + match_len]

152

if ann_sub == par_sub:

153

continue

154

for idx in xrange(match_len):

155

ann = ann_sub[idx]

156

par_ann = par_sub[idx]

157

ann_idx = lines_idx + idx

158

if ann == par_ann:

159

# Nothing to change

160

continue

161

if ann == this_annotation:

162

# Originally claimed 'this', but it was really in this

163

# parent

164

annotations[ann_idx] = par_ann

165

continue

166

# Resolve the fact that both sides have a different value for

167

# last modified

168

if ann == last_ann and par_ann == last_parent:

169

annotations[ann_idx] = last_res

170

else:

171

new_ann = set(ann)

172

new_ann.update(par_ann)

173

new_ann = tuple(sorted(new_ann))

174

annotations[ann_idx] = new_ann

175

last_ann = ann

176

last_parent = par_ann

177

last_res = new_ann

178

179

def _record_annotation(self, key, parent_keys, annotations):

180

self._annotations_cache[key] = annotations

181

for parent_key in parent_keys:

182

num = self._num_needed_children[parent_key]

183

num -= 1

184

if num == 0:

185

del self._text_cache[parent_key]

186

del self._annotations_cache[parent_key]

187

# Do we want to clean up _num_needed_children at this point as

188

# well?

189

self._num_needed_children[parent_key] = num

190

191

def _annotate_one(self, key, text, num_lines):

192

this_annotation = (key,)

193

# Note: annotations will be mutated by calls to _update_from*

194

annotations = [this_annotation] * num_lines

195

parent_keys = self._parent_map[key]

196

if parent_keys:

197

self._update_from_one_parent(key, annotations, text, parent_keys[0])

198

for parent in parent_keys[1:]:

199

self._update_from_other_parents(key, annotations, text,

200

this_annotation, parent)

201

self._record_annotation(key, parent_keys, annotations)

202

203

def annotate(self, key):

204

"""Return annotated fulltext for the given key."""

205

keys = self._get_needed_texts(key)

206

pb = ui.ui_factory.nested_progress_bar()

207

try:

208

for text_key, text, num_lines in self._get_needed_texts(key, pb=pb):

209

self._annotate_one(text_key, text, num_lines)

210

finally:

211

pb.finished()

212

try:

213

annotations = self._annotations_cache[key]

214

except KeyError:

215

raise errors.RevisionNotPresent(key, self._vf)

216

return annotations, self._text_cache[key]

217

218

def _get_heads_provider(self):

219

if self._heads_provider is None:

220

self._heads_provider = _mod_graph.KnownGraph(self._parent_map)

221

return self._heads_provider

222

223

def annotate_flat(self, key):

224

"""Determine the single-best-revision to source for each line.

225

226

This is meant as a compatibility thunk to how annotate() used to work.

227

"""

228

annotations, lines = self.annotate(key)

229

assert len(annotations) == len(lines)

230

out = []

231

heads = self._get_heads_provider().heads

232

append = out.append

233

for annotation, line in zip(annotations, lines):

234

if len(annotation) == 1:

235

append((annotation[0], line))

236

else:

237

the_heads = heads(annotation)

238

if len(the_heads) == 1:

239

for head in the_heads:

240

break

241

else:

242

# We need to resolve the ambiguity, for now just pick the

243

# sorted smallest

244

head = sorted(the_heads)[0]

245

append((head, line))

246

return out

Older »