# Copyright (C) 2005, 2006 by Canonical Ltd
# Written by Martin Pool.
# Modified by Johan Rydberg <jrydberg@gnu.org>
# Modified by Robert Collins <robert.collins@canonical.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only
writes.

lifeless: the data file is made up of "delta records".  each delta record has a delta header
that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of
the -expanded data- (ie, the delta applied to the parent).  the delta also ends with an
end-marker; simply "end VERSION"

delta can be line or full contents.
... the 8's there are the index number of the annotation.
version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e
8 e.set('executable', 'yes')
8 if elt.get('executable') == 'yes':
8 ie.executable = True
end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents
09:33 < jrydberg> lifeless: the parents are currently dictionary compressed
09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)
09:33 < lifeless> right
09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used
in the deltas to provide line annotation
"""

# 10:16 < lifeless> make partial index writes safe
# 10:16 < lifeless> implement 'knit.check()' like weave.check()
# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave'
# move sha1 out of the content so that join is faster at verifying parents
# record content length ?

from cStringIO import StringIO
import difflib
from itertools import izip, chain

import bzrlib.errors as errors
from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
        InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
        RevisionNotPresent, RevisionAlreadyPresent
from bzrlib.tuned_gzip import *
from bzrlib.trace import mutter
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
        sha_strings
from bzrlib.versionedfile import VersionedFile, InterVersionedFile
from bzrlib.tsort import topo_sort
import bzrlib.ui
import bzrlib.weave


# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data
# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor
# position after writing to work out where it was located.  we may need to
# bypass python file buffering.

DATA_SUFFIX = '.knit'
INDEX_SUFFIX = '.kndx'
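

# Illustrative sketch (not part of the knit API): builds the uncompressed body
# of one data record in the layout described in the module docstring above --
# a "version <id> <numlines> <digest>" header, the stored lines, then an
# "end <id>" marker.  The helper name is an assumption for the example only;
# _KnitData._record_to_data below is the real implementation and also gzips
# the result before it is appended to the .knit file.
def _example_record_body(version_id, lines):
    digest = sha_strings(lines)
    record = ["version %s %d %s\n" % (version_id, len(lines), digest)]
    record.extend(lines)
    record.append("end %s\n" % version_id)
    return ''.join(record)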


class KnitContent(object):
    """Content of a knit version to which deltas can be applied."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        for origin, text in self._lines:
            yield origin, text

    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = [text for origin, text in new_lines._lines]
        old_texts = [text for origin, text in self._lines]
        s = SequenceMatcher(None, old_texts, new_texts)
        for op in s.get_opcodes():
            if op[0] == 'equal':
                continue
            #     ofrom   oto   length        data
            yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))

    def text(self):
        return [text for origin, text in self._lines]


class _KnitFactory(object):
    """Base factory for creating content objects."""

    def make(self, lines, version):
        num_lines = len(lines)
        return KnitContent(zip([version] * num_lines, lines))
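

# Illustrative sketch (not part of the knit API): shows the in-memory shape the
# classes above work with.  KnitContent holds (origin, text) pairs, and
# line_delta() produces (start, end, count, new (origin, text) pairs) hunks.
# The helper name is an assumption for the example only.
def _example_line_delta():
    old = KnitContent([('rev-1', 'a\n'), ('rev-1', 'b\n')])
    new = KnitContent([('rev-2', 'a\n'), ('rev-2', 'c\n')])
    # replaces line 1..2 of the old text with one new line:
    # [(1, 2, 1, [('rev-2', 'c\n')])]
    return old.line_delta(new)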


class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def parse_fulltext(self, content, version):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\n
        internal representation is of the format:
        (revid, plaintext)
        """
        lines = []
        for line in content:
            origin, text = line.split(' ', 1)
            lines.append((origin.decode('utf-8'), text))
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines, version):
        for result_item in self.parse_line_delta(lines, version):
            yield result_item

    def parse_line_delta(self, lines, version):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        1..count lines:
        revid(utf8) newline\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])
        """
        result = []
        lines = iter(lines)
        next = lines.next
        # walk through the lines parsing.
        for header in lines:
            start, end, count = [int(n) for n in header.split(',')]
            contents = []
            for i in xrange(count):
                origin, text = next().split(' ', 1)
                contents.append((origin.decode('utf-8'), text))
            result.append((start, end, count, contents))
        return result

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        return ['%s %s' % (o.encode('utf-8'), t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            for origin, text in lines:
                out.append('%s %s' % (origin.encode('utf-8'), text))
        return out
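

# Illustrative sketch (not part of the knit API): the serialized form produced
# by KnitAnnotateFactory.lower_line_delta() for the hunk format documented in
# parse_line_delta() above -- a "start,end,count" header line followed by
# count "origin text" lines.  The helper name is an assumption for the example.
def _example_annotated_delta_lines():
    factory = KnitAnnotateFactory()
    delta = [(1, 2, 1, [(u'rev-2', 'c\n')])]
    # lower_line_delta() returns ['1,2,1\n', 'rev-2 c\n'];
    # parse_line_delta() inverts it back to the hunk list above.
    lines = factory.lower_line_delta(delta)
    return factory.parse_line_delta(lines[:], u'rev-2')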


class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects."""

    annotated = False

    def parse_fulltext(self, content, version):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - it's just a constant versionid.
        """
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        while lines:
            header = lines.pop(0)
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version] * c, lines[:c])
            lines = lines[c:]

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines, version))

    def lower_fulltext(self, content):
        return content.text()

    def lower_line_delta(self, delta):
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend([text for origin, text in lines])
        return out


def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location."""
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory(), create=True)
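

# Illustrative usage sketch (assumptions: 'transport' is a writable bzrlib
# transport, such as one returned by bzrlib.transport.get_transport(), and the
# knit name does not exist yet).  It is not called anywhere in this module.
def _example_knit_usage(transport):
    k = KnitVersionedFile('example', transport, access_mode='w', create=True)
    k.add_lines('rev-1', [], ['hello\n'])
    k.add_lines('rev-2', ['rev-1'], ['hello\n', 'world\n'])
    # get_lines() reconstructs the text; annotate() pairs each line with the
    # version that introduced it.
    assert k.get_lines('rev-2') == ['hello\n', 'world\n']
    return k.annotate('rev-2')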


class KnitVersionedFile(VersionedFile):
    """Weave-like structure with faster random access.

    A knit stores a number of texts and a summary of the relationships
    between them.  Texts are identified by a string version-id.  Texts
    are normally stored and retrieved as a series of lines, but can
    also be passed as single strings.

    Lines are stored with the trailing newline (if any) included, to
    avoid special cases for files with no final newline.  Lines are
    composed of 8-bit characters, not unicode.  The combination of
    these approaches should mean any 'binary' file can be safely
    stored and retrieved.
    """

    def __init__(self, relpath, transport, file_mode=None, access_mode=None, factory=None,
                 basis_knit=None, delta=True, create=False):
        """Construct a knit at location specified by relpath.

        :param create: If not True, only open an existing knit.
        """
        if access_mode is None:
            access_mode = 'w'
        super(KnitVersionedFile, self).__init__(access_mode)
        assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
        assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
            type(basis_knit)

        self.transport = transport
        self.filename = relpath
        self.basis_knit = basis_knit
        self.factory = factory or KnitAnnotateFactory()
        self.writable = (access_mode == 'w')
        self.delta = delta

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
            access_mode, create=create, file_mode=file_mode)
        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
            access_mode, create=create and not len(self), file_mode=file_mode)

    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, []) # should we check the lines ?
        self._check_versions_present(parents)
        present_parents = []
        ghosts = []
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta_parent is None:
            # reconstitute as full text.
            assert len(delta) == 1 or len(delta) == 0
            if len(delta):
                assert delta[0][0] == 0
                assert delta[0][1] == 0, delta[0][1]
            return super(KnitVersionedFile, self)._add_delta(version_id,
                parents, delta_parent, sha1, noeol, delta)

        options = []
        if noeol:
            options.append('no-eol')

        if delta_parent is not None:
            # determine the current delta chain length.
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spent applying deltas.
            count = 0
            delta_parents = [delta_parent]
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # did not find a fulltext in the delta limit.
                # just do a normal insertion.
                return super(KnitVersionedFile, self)._add_delta(version_id,
                    parents, delta_parent, sha1, noeol, delta)

        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta)

        digest = sha1
        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)

    def clear_cache(self):
        """Clear the data cache only."""
        self._data.clear_cache()

    def copy_to(self, name, transport):
        """See VersionedFile.copy_to()."""
        # copy the current index to a temp index to avoid racing with local
        # writes
        transport.put(name + INDEX_SUFFIX + '.tmp', self.transport.get(self._index._filename),)
        # copy the data file
        transport.put(name + DATA_SUFFIX, self._data._open_file())
        # rename the copied index into place
        transport.rename(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)

    def create_empty(self, name, transport, mode=None):
        return KnitVersionedFile(name, transport, factory=self.factory, delta=self.delta, create=True)

    def _fix_parents(self, version, new_parents):
        """Fix the parents list for version.

        This is done by appending a new version to the index
        with identical data except for the parents list.
        the parents list must be a superset of the current
        parents.
        """
        current_values = self._index._cache[version]
        assert set(current_values[4]).difference(set(new_parents)) == set()
        self._index.add_version(version,
                                current_values[1],
                                current_values[2],
                                current_values[3],
                                new_parents)

    def get_delta(self, version_id):
        """Get a delta for constructing version from some other version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        parents = self.get_parents(version_id)
        if len(parents):
            parent = parents[0]
        else:
            parent = None
        data_pos, data_size = self._index.get_position(version_id)
        data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
        version_idx = self._index.lookup(version_id)
        noeol = 'no-eol' in self._index.get_options(version_id)
        if 'fulltext' == self._index.get_method(version_id):
            new_content = self.factory.parse_fulltext(data, version_idx)
            if parent is not None:
                reference_content = self._get_content(parent)
                old_texts = reference_content.text()
            else:
                old_texts = []
            new_texts = new_content.text()
            delta_seq = SequenceMatcher(None, old_texts, new_texts)
            return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
        else:
            delta = self.factory.parse_line_delta(data, version_idx)
            return parent, sha1, noeol, delta

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    def get_sha1(self, version_id):
        """See VersionedFile.get_sha1()."""
        components = self._get_components(version_id)
        return components[-1][-1][-1]

    @staticmethod
    def get_suffixes():
        """See VersionedFile.get_suffixes()."""
        return [DATA_SUFFIX, INDEX_SUFFIX]

    def has_ghost(self, version_id):
        """True if there is a ghost reference in the file to version_id."""
        if self.has_version(version_id):
            return False
        # optimisable if needed by memoising the _ghosts set.
        items = self._index.get_graph()
        for node, parents in items:
            for parent in parents:
                if parent not in self._index._cache:
                    if parent == version_id:
                        return True
        return False

    def versions(self):
        """See VersionedFile.versions."""
        return self._index.get_versions()

    def has_version(self, version_id):
        """See VersionedFile.has_version."""
        return self._index.has_version(version_id)

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text.
        """
        if annotated:
            delta_seq = None
            for parent_id in parents:
                merge_content = self._get_content(parent_id, parent_texts)
                seq = SequenceMatcher(None, merge_content.text(), content.text())
                if delta_seq is None:
                    # setup a delta seq to reuse.
                    delta_seq = seq
                for i, j, n in seq.get_matching_blocks():
                    if n == 0:
                        continue
                    # this appears to copy (origin, text) pairs across to the new
                    # content for any line that matches the last-checked parent.
                    # FIXME: save the sequence control data for delta compression
                    # against the most relevant parent rather than rediffing.
                    content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            if not annotated:
                reference_content = self._get_content(parents[0], parent_texts)
                new_texts = content.text()
                old_texts = reference_content.text()
                delta_seq = SequenceMatcher(None, old_texts, new_texts)
            return self._make_line_delta(delta_seq, content)

    def _make_line_delta(self, delta_seq, new_content):
        """Generate a line delta from delta_seq and new_content."""
        diff_hunks = []
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
        return diff_hunks

    def _get_components(self, version_id):
        """Return a list of (version_id, method, data) tuples that
        make up the version specified by version_id of the knit.

        The components should be applied in the order of the returned
        list.

        The basis knit will be used to the largest extent possible
        since it is assumed that accesses to it are faster.
        """
        # 4168 calls in 14912, 2289 internal
        # 4168 in 9711 to read_records
        # 52554 in 1250 to get_parents
        # 170166 in 865 to list.append

        # needed_revisions holds a list of (method, version_id) of
        # versions that need to be fetched to construct the final
        # version of the file.
        #
        # basis_revisions is a list of versions that need to be
        # fetched but exist in the basis knit.
        basis = self.basis_knit
        needed_versions = []
        basis_versions = []
        cursor = version_id

        while 1:
            picked_knit = self
            if basis and basis._index.has_version(cursor):
                picked_knit = basis
                basis_versions.append(cursor)
            method = picked_knit._index.get_method(cursor)
            needed_versions.append((method, cursor))
            if method == 'fulltext':
                break
            cursor = picked_knit.get_parents(cursor)[0]

        components = {}
        if basis_versions:
            records = []
            for comp_id in basis_versions:
                data_pos, data_size = basis._index.get_data_position(comp_id)
                records.append((comp_id, data_pos, data_size))
            components.update(basis._data.read_records(records))

        records = []
        for comp_id in [vid for method, vid in needed_versions
                        if vid not in basis_versions]:
            data_pos, data_size = self._index.get_position(comp_id)
            records.append((comp_id, data_pos, data_size))
        components.update(self._data.read_records(records))

        # get_data_records returns a mapping with the version id as
        # index and the value as data.  The order the components need
        # to be applied is held by needed_versions (reversed).
        out = []
        for method, comp_id in reversed(needed_versions):
            out.append((comp_id, method, components[comp_id]))

        return out

    def _get_content(self, version_id, parent_texts={}):
        """Returns a content object that makes up the specified
        version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        cached_version = parent_texts.get(version_id, None)
        if cached_version is not None:
            return cached_version

        if self.basis_knit and version_id in self.basis_knit:
            return self.basis_knit._get_content(version_id)

        content = None
        components = self._get_components(version_id)
        for component_id, method, (data, digest) in components:
            version_idx = self._index.lookup(component_id)
            if method == 'fulltext':
                assert content is None
                content = self.factory.parse_fulltext(data, version_idx)
            elif method == 'line-delta':
                delta = self.factory.parse_line_delta(data, version_idx)
                content._lines = self._apply_delta(content._lines, delta)

        if 'no-eol' in self._index.get_options(version_id):
            line = content._lines[-1][1].rstrip('\n')
            content._lines[-1] = (content._lines[-1][0], line)

        # digest here is the digest from the last applied component.
        if sha_strings(content.text()) != digest:
            raise KnitCorrupt(self.filename, 'sha-1 does not match %s' % version_id)

        return content

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for r in list(version_ids):
            if self._index.has_version(r):
                version_ids.remove(r)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self.filename)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _add_lines(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _check_add(self, version_id, lines):
        """Check that version_id and lines are safe to add."""
        assert self.writable, "knit is not opened for write"
        ### FIXME escape. RBC 20060228
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id)
        if self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)
        self._check_lines_not_unicode(lines)
        self._check_lines_are_lines(lines)

    def _add(self, version_id, lines, parents, delta, parent_texts):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first parent.

        Any versions not present will be converted into ghosts.
        """
        #  461    0   6546.0390     43.9100   bzrlib.knit:489(_add)
        # +400    0    889.4890    418.9790   +bzrlib.knit:192(lower_fulltext)
        # +461    0   1364.8070    108.8030   +bzrlib.knit:996(add_record)
        # +461    0    193.3940     41.5720   +bzrlib.knit:898(add_version)
        # +461    0    134.0590     18.3810   +bzrlib.osutils:361(sha_strings)
        # +461    0     36.3420     15.4540   +bzrlib.knit:146(make)
        # +1383   0      8.0370      8.0370   +<len>
        # +61     0     13.5770      7.9190   +bzrlib.knit:199(lower_line_delta)
        # +61     0    963.3470      7.8740   +bzrlib.knit:427(_get_content)
        # +61     0    973.9950      5.2950   +bzrlib.knit:136(line_delta)
        # +61     0   1918.1800      5.2640   +bzrlib.knit:359(_merge_annotations)

        present_parents = []
        ghosts = []
        if parent_texts is None:
            parent_texts = {}
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta and not len(present_parents):
            delta = False

        digest = sha_strings(lines)
        options = []
        if lines[-1][-1] != '\n':
            options.append('no-eol')
            lines[-1] = lines[-1] + '\n'

        if len(present_parents) and delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spent applying deltas.
            count = 0
            delta_parents = present_parents
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                delta = False

        lines = self.factory.make(lines, version_id)
        if delta or (self.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if needed.
            delta_hunks = self._merge_annotations(lines, present_parents, parent_texts,
                                                  delta, self.factory.annotated)

        if delta:
            options.append('line-delta')
            store_lines = self.factory.lower_line_delta(delta_hunks)
        else:
            options.append('fulltext')
            store_lines = self.factory.lower_fulltext(lines)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)

    def check(self, progress_bar=None):
        """See VersionedFile.check()."""

    def _clone_text(self, new_version_id, old_version_id, parents):
        """See VersionedFile.clone_text()."""
        # FIXME RBC 20060228 make fast by only inserting an index with null delta.
        self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

    def get_lines(self, version_id):
        """See VersionedFile.get_lines()."""
        return self._get_content(version_id).text()

    def iter_lines_added_or_present_in_versions(self, version_ids=None):
        """See VersionedFile.iter_lines_added_or_present_in_versions()."""
        if version_ids is None:
            version_ids = self.versions()
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need version_id, position, length
        version_id_records = []
        requested_versions = list(version_ids)
        # filter for available versions
        for version_id in requested_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # get an in-component-order queue:
        version_ids = []
        for version_id in self.versions():
            if version_id in requested_versions:
                version_ids.append(version_id)
                data_pos, length = self._index.get_position(version_id)
                version_id_records.append((version_id, data_pos, length))

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            count = 0
            total = len(version_id_records)
            pb.update('Walking content.', count, total)
            for version_id, data, sha_value in \
                self._data.read_records_iter(version_id_records):
                pb.update('Walking content.', count, total)
                method = self._index.get_method(version_id)
                version_idx = self._index.lookup(version_id)
                assert method in ('fulltext', 'line-delta')
                if method == 'fulltext':
                    content = self.factory.parse_fulltext(data, version_idx)
                    for line in content.text():
                        yield line
                else:
                    delta = self.factory.parse_line_delta(data, version_idx)
                    for start, end, count, lines in delta:
                        for origin, line in lines:
                            yield line
                count += 1
            pb.update('Walking content.', total, total)
            pb.finished()
        except:
            pb.update('Walking content.', total, total)
            pb.finished()
            raise

    def num_versions(self):
        """See VersionedFile.num_versions()."""
        return self._index.num_versions()

    __len__ = num_versions

    def annotate_iter(self, version_id):
        """See VersionedFile.annotate_iter."""
        content = self._get_content(version_id)
        for origin, text in content.annotate_iter():
            yield origin, text

    def get_parents(self, version_id):
        """See VersionedFile.get_parents."""
        # 52554 calls in 1264 872 internal down from 3674
        try:
            return self._index.get_parents(version_id)
        except KeyError:
            raise RevisionNotPresent(version_id, self.filename)

    def get_parents_with_ghosts(self, version_id):
        """See VersionedFile.get_parents."""
        try:
            return self._index.get_parents_with_ghosts(version_id)
        except KeyError:
            raise RevisionNotPresent(version_id, self.filename)

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry(versions)

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry_with_ghosts(versions)

    #@deprecated_method(zero_eight)
    def walk(self, version_ids):
        """See VersionedFile.walk."""
        # We take the short path here, and extract all relevant texts
        # and put them in a weave and let that do all the work.  Far
        # from optimal, but is much simpler.
        # FIXME RB 20060228 this really is inefficient!
        from bzrlib.weave import Weave

        w = Weave(self.filename)
        ancestry = self.get_ancestry(version_ids)
        sorted_graph = topo_sort(self._index.get_graph())
        version_list = [vid for vid in sorted_graph if vid in ancestry]

        for version_id in version_list:
            lines = self.get_lines(version_id)
            w.add_lines(version_id, self.get_parents(version_id), lines)

        for lineno, insert_id, dset, line in w.walk(version_ids):
            yield lineno, insert_id, dset, line

    def plan_merge(self, ver_a, ver_b):
        """See VersionedFile.plan_merge."""
        ancestors_b = set(self.get_ancestry(ver_b))
        def status_a(revision, text):
            if revision in ancestors_b:
                return 'killed-b', text
            else:
                return 'new-a', text

        ancestors_a = set(self.get_ancestry(ver_a))
        def status_b(revision, text):
            if revision in ancestors_a:
                return 'killed-a', text
            else:
                return 'new-b', text

        annotated_a = self.annotate(ver_a)
        annotated_b = self.annotate(ver_b)
        plain_a = [t for (a, t) in annotated_a]
        plain_b = [t for (a, t) in annotated_b]
        blocks = SequenceMatcher(None, plain_a, plain_b).get_matching_blocks()
        a_cur = 0
        b_cur = 0
        for ai, bi, l in blocks:
            # process all mismatched sections
            # (last mismatched section is handled because blocks always
            # includes a 0-length last block)
            for revision, text in annotated_a[a_cur:ai]:
                yield status_a(revision, text)
            for revision, text in annotated_b[b_cur:bi]:
                yield status_b(revision, text)

            # and now the matched section
            a_cur = ai + l
            b_cur = bi + l
            for text_a, text_b in zip(plain_a[ai:a_cur], plain_b[bi:b_cur]):
                assert text_a == text_b
                yield "unchanged", text_a


class _KnitComponentFile(object):
    """One of the files used to implement a knit database"""

    def __init__(self, transport, filename, mode, file_mode=None):
        self._transport = transport
        self._filename = filename
        self._mode = mode
        self._file_mode = file_mode

    def write_header(self):
        if self._transport.append(self._filename, StringIO(self.HEADER),
            mode=self._file_mode):
            raise KnitCorrupt(self._filename, 'misaligned after writing header')

    def check_header(self, fp):
        line = fp.readline()
        if line != self.HEADER:
            raise KnitHeaderError(badline=line)

    def commit(self):
        """Commit is a nop."""

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self._filename)


class _KnitIndex(_KnitComponentFile):
    """Manages knit index file.

    The index is already kept in memory and read on startup, to enable
    fast lookups of revision information.  The cursor of the index
    file is always pointing to the end, making it easy to append
    entries.

    _cache is a cache for fast mapping from version id to an Index
    object.

    _history is a cache for fast mapping from indexes to version ids.

    The index data format is dictionary compressed when it comes to
    parent references; an index entry may only have parents with a
    lower index number.  As a result, the index is topologically sorted.

    Duplicate entries may be written to the index for a single version id;
    if this is done then the latter one completely replaces the former:
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.

    The index file on disc contains a header, followed by one line per knit
    record.  The same revision can be present in an index file more than once.
    The first occurrence gets assigned a sequence number starting from 0.

    The format of a single line is
    REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n
    REVISION_ID is a utf8-encoded revision id
    FLAGS is a comma separated list of flags about the record. Values include
        no-eol, line-delta, fulltext.
    BYTE_OFFSET is the ascii representation of the byte offset in the data file
        that the compressed data starts at.
    LENGTH is the ascii representation of the length of the record's data in
        the data file.
    PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of
        REVISION_ID.
    PARENT_SEQUENCE_ID the ascii representation of the sequence number of a
        revision id already in the knit that is a parent of REVISION_ID.
    The ' :' marker is the end of record marker.

    When a write to the index file is interrupted, it will result in a line that
    does not end in ' :'. If the ' :' is not present at the end of a line, or at
    the end of the file, then the record that is missing it will be ignored by
    the parser.

    When writing new records to the index file, the data is preceded by '\n'
    to ensure that records always start on new lines even if the last write was
    interrupted. As a result it's normal for the last line in the index to be
    missing a trailing newline. One can be added with no harmful effects.
    """

    HEADER = "# bzr knit index 8\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
         indexes).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            index = len(self._history)
            self._history.append(version_id)
        else:
            index = self._cache[version_id][5]
        self._cache[version_id] = (version_id,
                                   options,
                                   pos,
                                   size,
                                   parents,
                                   index)

    def __init__(self, transport, filename, mode, create=False, file_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
        self._cache = {}
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            count = 0
            total = 1
            try:
                pb.update('read knit index', count, total)
                fp = self._transport.get(self._filename)
                self.check_header(fp)
                # readlines reads the whole file at once:
                # bad for transports like http, good for local disk
                # we save 60 ms doing this one change (
                # from calling readline each time to calling
                # readlines once).
                # probably what we want for nice behaviour on
                # http is an incremental readlines that yields, or
                # a check for local vs non local indexes,
                for l in fp.readlines():
                    rec = l.split()
                    if len(rec) < 5 or rec[-1] != ':':
                        # corrupt line.
                        # FIXME: in the future we should determine if its a
                        # short write - and ignore it
                        # or a different failure, and raise. RBC 20060407
                        continue
                    count += 1
                    total += 1
                    #pb.update('read knit index', count, total)
                    # See self._parse_parents
                    parents = []
                    for value in rec[4:-1]:
                        if value[0] == '.':
                            # uncompressed reference
                            parents.append(value[1:])
                        else:
                            # this is 15/4000ms faster than isinstance,
                            # this function is called thousands of times a
                            # second so small variations add up.
                            assert value.__class__ is str
                            parents.append(self._history[int(value)])
                    # end self._parse_parents
                    # self._cache_version(rec[0],
                    #                     rec[1].split(','),
                    #                     int(rec[2]),
                    #                     int(rec[3]),
                    #                     parents)
                    # --- self._cache_version
                    # only want the _history index to reference the 1st
                    # index entry for version_id
                    version_id = rec[0]
                    if version_id not in self._cache:
                        index = len(self._history)
                        self._history.append(version_id)
                    else:
                        index = self._cache[version_id][5]
                    self._cache[version_id] = (version_id,
                                               rec[1].split(','),
                                               int(rec[2]),
                                               int(rec[3]),
                                               parents,
                                               index)
                    # --- self._cache_version
            except NoSuchFile, e:
                if mode != 'w' or not create:
                    raise
                self.write_header()
        finally:
            pb.update('read knit index', total, total)
            pb.finished()

    def _parse_parents(self, compressed_parents):
        """convert a list of string parent values into version ids.

        ints are looked up in the index.
        .FOO values are ghosts and converted into FOO.

        NOTE: the function is retained here for clarity, and for possible
              use in partial index reads. However bulk processing now has
              it inlined in __init__ for inner-loop optimisation.
        """
        result = []
        for value in compressed_parents:
            if value[0] == '.':
                # uncompressed reference
                result.append(value[1:])
            else:
                # this is 15/4000ms faster than isinstance,
                # this function is called thousands of times a
                # second so small variations add up.
                assert value.__class__ is str
                result.append(self._history[int(value)])
        return result

    def get_graph(self):
        graph = []
        for version_id, index in self._cache.iteritems():
            graph.append((version_id, index[4]))
        return graph

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            parents = self._cache[version][4]
            # got the parents ok
            parents = [parent for parent in parents if parent in self._cache]
            for parent in parents:
                # if not completed and not a ghost
                if parent not in graph:
                    pending.add(parent)
            graph[version] = parents
        return topo_sort(graph.items())

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            try:
                parents = self._cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
            else:
                # got the parents ok
                for parent in parents:
                    if parent not in graph:
                        pending.add(parent)
                graph[version] = parents
        return topo_sort(graph.items())

    def num_versions(self):
        return len(self._history)

    __len__ = num_versions

    def get_versions(self):
        return self._history

    def idx_to_name(self, idx):
        return self._history[idx]

    def lookup(self, version_id):
        assert version_id in self._cache
        return self._cache[version_id][5]

    def _version_list_to_index(self, versions):
        result_list = []
        for version in versions:
            if version in self._cache:
                # -- inlined lookup() --
                result_list.append(str(self._cache[version][5]))
                # -- end lookup () --
            else:
                result_list.append('.' + version.encode('utf-8'))
        return ' '.join(result_list)

    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        self._cache_version(version_id, options, pos, size, parents)

        content = "\n%s %s %s %s %s :" % (version_id.encode('utf-8'),
                                          ','.join(options),
                                          pos,
                                          size,
                                          self._version_list_to_index(parents))
        assert isinstance(content, str), 'content must be utf-8 encoded'
        self._transport.append(self._filename, StringIO(content))

    def has_version(self, version_id):
        """True if the version is in the index."""
        return self._cache.has_key(version_id)

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        return (self._cache[version_id][2],
                self._cache[version_id][3])
1162
def get_method(self, version_id):
1163
"""Return compression method of specified version."""
1164
options = self._cache[version_id][1]
1165
if 'fulltext' in options:
1168
assert 'line-delta' in options
1171
def get_options(self, version_id):
1172
return self._cache[version_id][1]
1174
def get_parents(self, version_id):
1175
"""Return parents of specified version ignoring ghosts."""
1176
return [parent for parent in self._cache[version_id][4]
1177
if parent in self._cache]
1179
def get_parents_with_ghosts(self, version_id):
1180
"""Return parents of specified version wth ghosts."""
1181
return self._cache[version_id][4]
1183
def check_versions_present(self, version_ids):
1184
"""Check that all specified versions are present."""
1185
version_ids = set(version_ids)
1186
for version_id in list(version_ids):
1187
if version_id in self._cache:
1188
version_ids.remove(version_id)
1190
raise RevisionNotPresent(list(version_ids)[0], self.filename)
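

# Illustrative sketch (not part of the index API): parses one line of the .kndx
# format documented in the _KnitIndex docstring into its raw fields.  Parent
# references are left as written -- either a numeric sequence id or a
# '.'-prefixed revision id (a possible ghost).  The helper name is an
# assumption for the example only.
def _example_parse_index_line(line):
    rec = line.split()
    if len(rec) < 5 or rec[-1] != ':':
        return None   # incomplete (interrupted) record, ignored by the parser
    version_id, options, pos, size = rec[0], rec[1].split(','), int(rec[2]), int(rec[3])
    parent_refs = rec[4:-1]
    return version_id, options, pos, size, parent_refs

# e.g. _example_parse_index_line('rev-2 line-delta 83 57 0 :\n') returns
# ('rev-2', ['line-delta'], 83, 57, ['0']).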


class _KnitData(_KnitComponentFile):
    """Contents of the knit data file"""

    HEADER = "# bzr knit data 8\n"

    def __init__(self, transport, filename, mode, create=False, file_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._file = None
        self._checked = False
        self._records = {}
        if create:
            self._transport.put(self._filename, StringIO(''), mode=file_mode)

    def clear_cache(self):
        """Clear the record cache."""
        self._records = {}

    def _open_file(self):
        if self._file is None:
            try:
                self._file = self._transport.get(self._filename)
            except NoSuchFile:
                pass
        return self._file

    def _record_to_data(self, version_id, digest, lines):
        """Convert version_id, digest, lines into a raw data block.

        :return: (len, a StringIO instance with the raw data ready to read.)
        """
        sio = StringIO()
        data_file = GzipFile(None, mode='wb', fileobj=sio)
        data_file.writelines(chain(
            ["version %s %d %s\n" % (version_id.encode('utf-8'),
                                     len(lines),
                                     digest)],
            lines,
            ["end %s\n" % version_id.encode('utf-8')]))
        data_file.close()
        length = sio.tell()
        sio.seek(0)
        return length, sio

    def add_raw_record(self, raw_data):
        """Append a prepared record to the data file."""
        assert isinstance(raw_data, str), 'data must be plain bytes'
        start_pos = self._transport.append(self._filename, StringIO(raw_data))
        return start_pos, len(raw_data)

    def add_record(self, version_id, digest, lines):
        """Write new text record to disk.  Returns the position in the
        file where it was written."""
        size, sio = self._record_to_data(version_id, digest, lines)
        self._records[version_id] = (digest, lines)
        start_pos = self._transport.append(self._filename, sio)
        return start_pos, size

    def _parse_record_header(self, version_id, raw_data):
        """Parse a record header for consistency.

        :return: the header and the decompressor stream.
                 as (stream, header_record)
        """
        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        rec = df.readline().split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')
        if rec[1].decode('utf-8') != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r' % (
                                  version_id, rec[1]))
        return df, rec

    def _parse_record(self, version_id, data):
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        df, rec = self._parse_record_header(version_id, data)
        record_contents = df.readlines()
        l = record_contents.pop()
        assert len(record_contents) == int(rec[2])
        if l.decode('utf-8') != 'end %s\n' % version_id:
            raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
                              % (l, version_id))
        return record_contents, rec[3]
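
    # Illustrative sketch (not part of the data-file API): shows how a raw
    # record produced by _record_to_data() above round-trips through
    # _parse_record().  The method name is an assumption for the example only.
    def _example_record_roundtrip(self, version_id, lines, digest):
        size, sio = self._record_to_data(version_id, digest, lines)
        contents, stored_digest = self._parse_record(version_id, sio.getvalue())
        assert contents == lines and stored_digest == digest
        return size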

    def read_records_iter_raw(self, records):
        """Read text records from data file and yield raw data.

        This unpacks enough of the text record to validate the id is
        as expected but that's all.

        It will actively recompress currently cached records on the
        basis that that is cheaper than I/O activity.
        """
        needed_records = []
        for version_id, pos, size in records:
            if version_id not in self._records:
                needed_records.append((version_id, pos, size))

        # setup an iterator of the external records:
        # uses readv so nice and fast we hope.
        if len(needed_records):
            # grab the disk data needed.
            raw_records = self._transport.readv(self._filename,
                [(pos, size) for version_id, pos, size in needed_records])

        for version_id, pos, size in records:
            if version_id in self._records:
                # compress a new version
                size, sio = self._record_to_data(version_id,
                                                 self._records[version_id][0],
                                                 self._records[version_id][1])
                yield version_id, sio.getvalue()
            else:
                pos, data = raw_records.next()
                # validate the header
                df, rec = self._parse_record_header(version_id, data)
                yield version_id, data

    def read_records_iter(self, records):
        """Read text records from data file and yield result.

        Each passed record is a tuple of (version_id, pos, len) and
        will be read in the given order.  Yields (version_id,
        contents, digest).
        """
        # 60890 calls for 4168 extractions in 5045, 683 internal.
        # 4168 calls to readv in 1411
        # 4168 calls to parse_record in 2880

        needed_records = []
        for version_id, pos, size in records:
            if version_id not in self._records:
                needed_records.append((version_id, pos, size))

        if len(needed_records):
            # We take it that the transport optimizes the fetching as well
            # as possible (ie, reads continuous ranges.)
            response = self._transport.readv(self._filename,
                [(pos, size) for version_id, pos, size in needed_records])

            for (record_id, pos, size), (pos, data) in izip(iter(needed_records), response):
                content, digest = self._parse_record(record_id, data)
                self._records[record_id] = (digest, content)

        for version_id, pos, size in records:
            yield version_id, list(self._records[version_id][1]), self._records[version_id][0]

    def read_records(self, records):
        """Read records into a dictionary."""
        components = {}
        for record_id, content, digest in self.read_records_iter(records):
            components[record_id] = (content, digest)
        return components


class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_from_factory = KnitVersionedFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with knits."""
        try:
            return (isinstance(source, KnitVersionedFile) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                if n1 != n2:
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            # safe
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            # plan the join:
            copy_queue = []
            copy_queue_records = []
            copy_set = set()
            for version_id in version_list:
                options = self.source._index.get_options(version_id)
                parents = self.source._index.get_parents_with_ghosts(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must :
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (self.target.has_version(parent) or
                            parent in copy_set or
                            not self.source.has_version(parent))
                data_pos, data_size = self.source._index.get_position(version_id)
                copy_queue_records.append((version_id, data_pos, data_size))
                copy_queue.append((version_id, options, parents))
                copy_set.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            # we want the raw gzip for bulk copying, but the record validated
            # just enough to be sure it's the right one.
            # TODO: consider writev or write combining to reduce
            # death of a thousand cuts feeling.
            for (version_id, raw_data), \
                (version_id2, options, parents) in \
                izip(self.source._data.read_records_iter_raw(copy_queue_records),
                     copy_queue):
                assert version_id == version_id2, 'logic error, inconsistent results'
                count = count + 1
                pb.update("Joining knit", count, total)
                pos, size = self.target._data.add_raw_record(raw_data)
                self.target._index.add_version(version_id, options, pos, size, parents)

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(InterKnit)


class WeaveToKnit(InterVersionedFile):
    """Optimised code paths for weave to knit operations."""

    _matching_file_from_factory = bzrlib.weave.WeaveFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with weaves to knits."""
        try:
            return (isinstance(source, bzrlib.weave.Weave) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, bzrlib.weave.Weave)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # if all of n2's parents are in n1, then its fine.
                if n2.difference(n1):
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            # safe
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            count = 0
            total = len(version_list)
            for version_id in version_list:
                pb.update("Converting to knit", count, total)
                parents = self.source.get_parents(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must already have it
                    assert (self.target.has_version(parent))
                self.target.add_lines(
                    version_id, parents, self.source.get_lines(version_id))
                count = count + 1

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(WeaveToKnit)


class SequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """
        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}

        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
            # following improvement
            #     704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to:
            #     704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400  0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize