    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def apply_delta(self, delta):
        """Apply delta to this content."""
        offset = 0
        for start, end, count, lines in delta:
            self._lines[offset+start:offset+end] = lines
            offset = offset + (start - end) + count
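
    # Illustrative sketch of apply_delta (assumed data, not from the original
    # source): with self._lines == [('r0', 'a\n'), ('r0', 'b\n'), ('r0', 'c\n')],
    # applying the single hunk (1, 2, 2, [('r1', 'x\n'), ('r1', 'y\n')])
    # replaces the slice [1:2] (just 'b\n') with the two new lines, giving
    # [('r0', 'a\n'), ('r1', 'x\n'), ('r1', 'y\n'), ('r0', 'c\n')]; offset then
    # grows by (start - end) + count == 1, so a later hunk's indices still
    # land on the right lines.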

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = [text for origin, text in new_lines._lines]
        old_texts = [text for origin, text in self._lines]
        s = SequenceMatcher(None, old_texts, new_texts)
        for op in s.get_opcodes():
            if op[0] == 'equal':
                continue
            #     ofrom   oto   length  data
            yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])
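
        For example (an illustrative sketch, not taken from a real knit),
        the serialized hunk

            1,2,2
            rev-a alpha
            rev-a beta

        parses to (1, 2, 2, [('rev-a', 'alpha\n'), ('rev-a', 'beta\n')]);
        each content line keeps its trailing newline because only the first
        space is split on.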
        """
        # walk through the lines parsing.
        while lines:
            header = lines.pop(0)
            start, end, count = [int(n) for n in header.split(',')]
            contents = []
            for i in range(count):
                origin, text = lines.pop(0).split(' ', 1)
                contents.append((origin.decode('utf-8'), text))
            yield start, end, count, contents

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines))

    def lower_fulltext(self, content):
        """Convert a fulltext content record into a serializable form.
        self.delta = delta

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
            access_mode, create=create, file_mode=file_mode)
        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
            access_mode, create=create and not len(self), file_mode=file_mode)

    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, [])  # should we check the lines?
        self._check_versions_present(parents)
        present_parents = []
        ghosts = []
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta_parent is None:
            # reconstitute as full text.
            assert len(delta) == 1 or len(delta) == 0
            if len(delta):
                assert delta[0][0] == 0
                assert delta[0][1] == 0, delta[0][1]
            return super(KnitVersionedFile, self)._add_delta(version_id,
                parents, delta_parent, sha1, noeol, delta)

        digest = sha1

        options = []
        if noeol:
            options.append('no-eol')

        if delta_parent is not None:
            # determine the current delta chain length.
            # To speed the extraction of texts, the delta chain is limited
            # to a fixed number of deltas. This should minimize both
            # I/O and the time spent applying deltas.
            count = 0
            delta_parents = [delta_parent]
            while len(delta_parents) == 1 and count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # did not find a fulltext in the delta limit.
                # just do a normal insertion.
                return super(KnitVersionedFile, self)._add_delta(version_id,
                    parents, delta_parent, sha1, noeol, delta)

        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)

    def clear_cache(self):
        """Clear the data cache only."""

    def get_delta(self, version_id):
        """Get a delta for constructing version from some other version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        parents = self.get_parents(version_id)
        if len(parents):
            parent = parents[0]
        else:
            parent = None
        data_pos, data_size = self._index.get_position(version_id)
        data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
        version_idx = self._index.lookup(version_id)
        noeol = 'no-eol' in self._index.get_options(version_id)
        if 'fulltext' == self._index.get_method(version_id):
            new_content = self.factory.parse_fulltext(data, version_idx)
            if parent is not None:
                reference_content = self._get_content(parent)
                old_texts = reference_content.text()
            else:
                old_texts = []
            new_texts = new_content.text()
            delta_seq = SequenceMatcher(None, old_texts, new_texts)
            return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
        else:
            delta = self.factory.parse_line_delta(data, version_idx)
            return parent, sha1, noeol, delta

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    def get_sha1(self, version_id):
        """See VersionedFile.get_sha1()."""
        components = self._get_components(version_id)
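        # _get_components returns (version_id, method, data) tuples (see its
        # docstring below); the [-1][-1][-1] chain appears to take the last
        # component, its data element, and the sha1 digest stored there.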
return components[-1][-1][-1]

    @staticmethod
    def get_suffixes():
        """See VersionedFile.get_suffixes()."""

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None):
"""Merge annotations for content. This is done by comparing
361
the annotations based on changed to the text."""
362
for parent_id in parents:
363
merge_content = self._get_content(parent_id)
364
seq = SequenceMatcher(None, merge_content.text(), content.text())
365
for i, j, n in seq.get_matching_blocks():
368
content._lines[j:j+n] = merge_content._lines[i:i+n]
457
the annotations based on changed to the text.
461
        for parent_id in parents:
            merge_content = self._get_content(parent_id, parent_texts)
            seq = SequenceMatcher(None, merge_content.text(), content.text())
            if delta_seq is None:
                # setup a delta seq to reuse.
                delta_seq = seq
            for i, j, n in seq.get_matching_blocks():
                # this appears to copy (origin, text) pairs across to the new
                # content for any line that matches the last-checked parent.
                # FIXME: save the sequence control data for delta compression
                # against the most relevant parent rather than rediffing.
                content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            reference_content = self._get_content(parents[0], parent_texts)
            new_texts = content.text()
            old_texts = reference_content.text()
            delta_seq = SequenceMatcher(None, old_texts, new_texts)
            return self._make_line_delta(delta_seq, content)

    def _make_line_delta(self, delta_seq, new_content):
        """Generate a line delta from delta_seq and new_content."""
        diff_hunks = []
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
        return diff_hunks

    def _get_components(self, version_id):
        """Return a list of (version_id, method, data) tuples that
            raise RevisionNotPresent(list(version_ids)[0], self.filename)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _add_lines(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _check_add(self, version_id, lines):
        """Check that version_id and lines are safe to add."""
        for lineno, insert_id, dset, line in w.walk(version_ids):
            yield lineno, insert_id, dset, line

    def plan_merge(self, ver_a, ver_b):
        """See VersionedFile.plan_merge."""
        ancestors_b = set(self.get_ancestry(ver_b))
        def status_a(revision, text):
            if revision in ancestors_b:
                return 'killed-b', text
            else:
                return 'new-a', text

        ancestors_a = set(self.get_ancestry(ver_a))
        def status_b(revision, text):
            if revision in ancestors_a:
                return 'killed-a', text
            else:
                return 'new-b', text

        annotated_a = self.annotate(ver_a)
        annotated_b = self.annotate(ver_b)
        plain_a = [t for (a, t) in annotated_a]
        plain_b = [t for (a, t) in annotated_b]
        blocks = SequenceMatcher(None, plain_a, plain_b).get_matching_blocks()
        a_cur = 0
        b_cur = 0
        for ai, bi, l in blocks:
            # process all mismatched sections
            # (last mismatched section is handled because blocks always
            # includes a 0-length last block)
            for revision, text in annotated_a[a_cur:ai]:
                yield status_a(revision, text)
            for revision, text in annotated_b[b_cur:bi]:
                yield status_b(revision, text)

            # and now the matched section
            a_cur = ai + l
            b_cur = bi + l
            for text_a, text_b in zip(plain_a[ai:a_cur], plain_b[bi:b_cur]):
                assert text_a == text_b
                yield "unchanged", text_a

class _KnitComponentFile(object):
    """One of the files used to implement a knit database."""

    def __init__(self, transport, filename, mode, file_mode=None):
        self._transport = transport
        self._filename = filename
        self._mode = mode
        self._file_mode = file_mode

    def write_header(self):
        if self._transport.append(self._filename, StringIO(self.HEADER),
            mode=self._file_mode):
            raise KnitCorrupt(self._filename, 'misaligned after writing header')

    def check_header(self, fp):
        line = fp.read(len(self.HEADER))
        if line != self.HEADER:
            raise KnitHeaderError(badline=line)
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.

    The index file on disc contains a header, followed by one line per knit
    record. The same revision can be present in an index file more than once.
    The first occurrence gets assigned a sequence number starting from 0.

    The format of a single line is:
    REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n
    REVISION_ID is a utf8-encoded revision id
    FLAGS is a comma separated list of flags about the record. Values include
        no-eol, line-delta, fulltext.
    BYTE_OFFSET is the ascii representation of the byte offset in the data file
        that the compressed data starts at.
    LENGTH is the ascii representation of the length of the record in the
        data file.
    PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of
        REVISION_ID.
    PARENT_SEQUENCE_ID the ascii representation of the sequence number of a
        revision id already in the knit that is a parent of REVISION_ID.
    The ' :' marker is the end of record marker.
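
    For example, a purely hypothetical record line (not from a real index):

    rev-2 line-delta 120 73 .ghost-rev 0 :

    describes rev-2 stored as a line-delta at byte offset 120 for 73 bytes,
    with one parent spelled out as a '.'-prefixed revision id (used when the
    parent has no sequence number in this index) and one parent given by
    sequence number (the revision at index 0).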

    When a write to the index file is interrupted, it will result in a line
    that does not end in ' :'. If the ' :' is not present at the end of a line,
    or at the end of the file, then the record that is missing it will be
    ignored by the parser.

    When writing new records to the index file, the data is preceded by '\n'
    to ensure that records always start on new lines even if the last write was
    interrupted. As a result it's normal for the last line in the index to be
    missing a trailing newline. One can be added with no harmful effects.
    """

HEADER = "# bzr knit index 8\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
         indexes).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            index = len(self._history)
            self._history.append(version_id)
        else:
            index = self._cache[version_id][5]
        self._cache[version_id] = (version_id,
                                   options,
                                   pos,
                                   size,
                                   parents,
                                   index)

    def __init__(self, transport, filename, mode, create=False, file_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
            pb.update('read knit index', count, total)
            fp = self._transport.get(self._filename)
            self.check_header(fp)
            # readlines reads the whole file at once:
            # bad for transports like http, good for local disk
            # we save 60 ms doing this one change (
            # from calling readline each time to calling
            # readlines once.
            # probably what we want for nice behaviour on
            # http is an incremental readlines that yields, or
            # a check for local vs non local indexes,
            for l in fp.readlines():
                rec = l.split()
                if len(rec) < 5 or rec[-1] != ':':
                    # FIXME: in the future we should determine if it's a
                    # short write - and ignore it
                    # or a different failure, and raise. RBC 20060407
                    continue
                #pb.update('read knit index', count, total)
                # See self._parse_parents
                parents = []
                for value in rec[4:-1]:
                    if '.' == value[0]:
                        # uncompressed reference
                        parents.append(value[1:])
                    else:
                        # this is 15/4000ms faster than isinstance,
                        # this function is called thousands of times a
                        # second so small variations add up.
                        assert value.__class__ is str
                        parents.append(self._history[int(value)])
                # end self._parse_parents
                # self._cache_version(rec[0],
                #                     rec[1].split(','),
                # --- self._cache_version
                # only want the _history index to reference the 1st
                # index entry for version_id
                version_id = rec[0]
                if version_id not in self._cache:
                    index = len(self._history)
                    self._history.append(version_id)
                else:
                    index = self._cache[version_id][5]
                self._cache[version_id] = (version_id,
                                           rec[1].split(','),
                                           int(rec[2]),
                                           int(rec[3]),
                                           parents,
                                           index)
                # --- self._cache_version
        except NoSuchFile, e:
            if mode != 'w' or not create:

    def _parse_record(self, version_id, data):
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        df, rec = self._parse_record_header(version_id, data)
        record_contents = df.readlines()
        l = record_contents.pop()
        assert len(record_contents) == int(rec[2])
        if l.decode('utf-8') != 'end %s\n' % version_id:
            raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
                              % (l, version_id))
        df.close()
        return record_contents, rec[3]

    def read_records_iter_raw(self, records):
        """Read text records from data file and yield raw data.
self.target.fix_parents(version, new_parents)

InterVersionedFile.register_optimiser(InterKnit)


class SequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block. Then that block is extended as
        far as possible by matching (only) junk elements on both sides. So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk. That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly. Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """
# CAUTION: stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle. That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

# changing b2j.get(a[i], nothing) to a try:Keyerror pair produced the
            # following improvement
            # 704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            # +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # 704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400  0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            # +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end. In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too. Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it. In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

return besti, bestj, bestsize