        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
            access_mode, create=not len(self.versions()))

    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, [])  # should we check the lines ?
        self._check_versions_present(parents)
        present_parents = []
        ghosts = []
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta_parent is None:
            # reconstitute as full text.
            assert len(delta) == 1 or len(delta) == 0
            if len(delta):
                assert delta[0][0] == 0
                assert delta[0][1] == 0, delta[0][1]
            return super(KnitVersionedFile, self)._add_delta(version_id,
                parents, delta_parent, sha1, noeol, delta)

        digest = sha1

        options = []
        if noeol:
            options.append('no-eol')

        if delta_parent is not None:
            # determine the current delta chain length.
            # To speed the extraction of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spent applying deltas.
            count = 0
            delta_parents = [delta_parent]
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # did not find a fulltext in the delta limit.
                # just do a normal insertion.
                return super(KnitVersionedFile, self)._add_delta(version_id,
                    parents, delta_parent, sha1, noeol, delta)

        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)
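
    # Illustrative sketch (not part of the original bzrlib API): how the bounded
    # walk in _add_delta above measures a delta chain before deciding whether a
    # new delta may be stored.  The toy 'methods'/'parents' mappings, the helper
    # name and the limit of 25 are assumptions for the example only.
    def _sketch_delta_chain_length(methods, parents, version_id, limit=25):
        """Count deltas between version_id and its nearest stored fulltext.

        methods maps version -> 'fulltext' or 'line-delta'; parents maps
        version -> list of parents, where the first parent is the compression
        basis (mirroring how _add_delta walks delta_parents[0]).
        """
        count = 0
        current = version_id
        while count < limit and methods[current] == 'line-delta':
            current = parents[current][0]
            count += 1
        return count
    # e.g. _sketch_delta_chain_length({'a': 'line-delta', 'b': 'line-delta',
    #                                  'c': 'fulltext'},
    #                                 {'a': ['b'], 'b': ['c'], 'c': []},
    #                                 'a') == 2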

    def clear_cache(self):
        """Clear the data cache only."""
        self._data.clear_cache()

                                current_values[3],

    def get_delta(self, version_id):
        """Get a delta for constructing version from some other version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        parents = self.get_parents(version_id)
        if len(parents):
            parent = parents[0]
        else:
            parent = None
        data_pos, data_size = self._index.get_position(version_id)
        data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
        version_idx = self._index.lookup(version_id)
        noeol = 'no-eol' in self._index.get_options(version_id)
        if 'fulltext' == self._index.get_method(version_id):
            new_content = self.factory.parse_fulltext(data, version_idx)
            if parent is not None:
                reference_content = self._get_content(parent)
                old_texts = reference_content.text()
            else:
                old_texts = []
            new_texts = new_content.text()
            delta_seq = SequenceMatcher(None, old_texts, new_texts)
            return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
        else:
            delta = self.factory.parse_line_delta(data, version_idx)
            return parent, sha1, noeol, delta

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text.
        """
        if annotated:
            delta_seq = None
            for parent_id in parents:
                merge_content = self._get_content(parent_id, parent_texts)
                seq = SequenceMatcher(None, merge_content.text(), content.text())
                if delta_seq is None:
                    # setup a delta seq to reuse.
                    delta_seq = seq
                for i, j, n in seq.get_matching_blocks():
                    # this appears to copy (origin, text) pairs across to the new
                    # content for any line that matches the last-checked parent.
                    # FIXME: save the sequence control data for delta compression
                    # against the most relevant parent rather than rediffing.
                    content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            if not annotated:
                reference_content = self._get_content(parents[0], parent_texts)
                new_texts = content.text()
                old_texts = reference_content.text()
                delta_seq = SequenceMatcher(None, old_texts, new_texts)
            return self._make_line_delta(delta_seq, content)
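
    # Illustrative sketch (not part of the original bzrlib API): the core of the
    # annotation merge above, shown on plain (origin, text) tuples instead of
    # knit content objects.  The helper name and the list-of-tuples
    # representation are assumptions for the example only.
    def _sketch_copy_matching_annotations(parent_lines, new_lines):
        """Copy (origin, text) pairs from parent_lines onto matching new_lines.

        Both arguments are lists of (origin, text) tuples; any run of new_lines
        whose text matches a block of parent_lines inherits the parent origins,
        just as content._lines[j:j+n] = merge_content._lines[i:i+n] does above.
        """
        seq = SequenceMatcher(None,
                              [text for origin, text in parent_lines],
                              [text for origin, text in new_lines])
        for i, j, n in seq.get_matching_blocks():
            new_lines[j:j+n] = parent_lines[i:i+n]
        return new_lines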

    def _make_line_delta(self, delta_seq, new_content):
        """Generate a line delta from delta_seq and new_content."""
        diff_hunks = []
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
        return diff_hunks
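
    # Illustrative sketch (not part of the original bzrlib API): applying the
    # (start, end, count, lines) hunks produced by _make_line_delta to a list
    # of old lines.  The helper name is an assumption for the example only.
    def _sketch_apply_line_delta(old_lines, hunks):
        """Return the new lines built by replaying hunks over old_lines.

        Each hunk replaces old_lines[start:end] with the supplied lines;
        hunks are ordered by start, so the result is assembled left to right.
        """
        out = []
        pos = 0
        for start, end, count, lines in hunks:
            out.extend(old_lines[pos:start])   # unchanged region before the hunk
            out.extend(lines)                  # replacement lines from the delta
            pos = end
        out.extend(old_lines[pos:])            # unchanged tail
        return out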

    def _get_components(self, version_id):
        """Return a list of (version_id, method, data) tuples that
        make up the version specified by version_id of the knit.
        """

            raise RevisionNotPresent(list(version_ids)[0], self.filename)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _add_lines(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _check_add(self, version_id, lines):
        """check that version_id and lines are safe to add."""


class _KnitIndex(_KnitComponentFile):

    HEADER = "# bzr knit index 7\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
         indexes).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            self._history.append(version_id)
        self._cache[version_id] = (version_id, options, pos, size, parents)

    def __init__(self, transport, filename, mode, create=False):
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._cache = {}
        self._history = []
        count = 0
        total = 1
        try:
            pb.update('read knit index', count, total)
            fp = self._transport.get(self._filename)
            self.check_header(fp)
            # readlines reads the whole file at once:
            # bad for transports like http, good for local disk
            # we save 60 ms doing this one change (
            # from calling readline each time to calling
            # readlines once).
            # probably what we want for nice behaviour on
            # http is an incremental readlines that yields, or
            # a check for local vs non local indexes,
            for l in fp.readlines():
                rec = l.split()
                #pb.update('read knit index', count, total)
                # See self._parse_parents
                parents = []
                for value in rec[4:]:
                    if value[0] == '.':
                        # uncompressed reference
                        parents.append(value[1:])
                    else:
                        # this is 15/4000ms faster than isinstance,
                        # this function is called thousands of times a
                        # second so small variations add up.
                        assert value.__class__ is str
                        parents.append(self._history[int(value)])
                # end self._parse_parents
                # self._cache_version(rec[0],
                #                     rec[1].split(','),
                #                     int(rec[2]),
                #                     int(rec[3]),
                #                     parents)
                # --- self._cache_version
                # only want the _history index to reference the 1st
                # index entry for version_id
                version_id = rec[0]
                if version_id not in self._cache:
                    self._history.append(version_id)
                self._cache[version_id] = (version_id,
                                           rec[1].split(','),
                                           int(rec[2]),
                                           int(rec[3]),
                                           parents)
                # --- self._cache_version
        except NoSuchFile, e:
            if mode != 'w' or not create:
                raise
            self.write_header()
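
    # Illustrative sketch (not part of the original bzrlib API): the record
    # layout the parsing loop above expects.  Each index line holds
    # whitespace-separated fields: version-id, comma-separated options, byte
    # position, byte size, then parent references that are either
    # '.<version-id>' (an uncompressed reference) or an integer offset into the
    # history list built so far.  The helper name is an assumption.
    def _sketch_parse_index_line(line, history):
        """Parse one index line into (version_id, options, pos, size, parents)."""
        rec = line.split()
        parents = []
        for value in rec[4:]:
            if value[0] == '.':
                parents.append(value[1:])            # uncompressed reference
            else:
                parents.append(history[int(value)])  # compressed: history offset
        return rec[0], rec[1].split(','), int(rec[2]), int(rec[3]), parents
    # e.g. _sketch_parse_index_line('rev-2 line-delta 120 80 0 .ghost-rev',
    #                               ['rev-1']) ==
    #      ('rev-2', ['line-delta'], 120, 80, ['rev-1', 'ghost-rev'])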

                self.target.fix_parents(version, new_parents)


InterVersionedFile.register_optimiser(InterKnit)


# make GzipFile faster:

class GzipFile(gzip.GzipFile):
    """Knit tuned version of GzipFile.

    This is based on the following lsprof stats:
    python 2.4 stock GzipFile write:
     58971      0      5644.3090   2721.4730   gzip:193(write)
    +58971      0      1159.5530   1159.5530   +<built-in method compress>
    +176913     0       987.0320    987.0320   +<len>
    +58971      0       423.1450    423.1450   +<zlib.crc32>
    +58971      0       353.1060    353.1060   +<method 'write' of 'cStringIO.
    tuned GzipFile write:
     58971      0      4477.2590   2103.1120   bzrlib.knit:1250(write)
    +58971      0      1297.7620   1297.7620   +<built-in method compress>
    +58971      0       406.2160    406.2160   +<zlib.crc32>
    +58971      0       341.9020    341.9020   +<method 'write' of 'cStringIO.
    +58971      0       328.2670    328.2670   +<len>

    Yes, it's only 1.6 seconds, but they add up.
    """

    def write(self, data):
        if self.mode != gzip.WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"
        data_len = len(data)
        self.size = self.size + data_len
        self.crc = zlib.crc32(data, self.crc)
        self.fileobj.write(self.compress.compress(data))
        self.offset += data_len

    def writelines(self, lines):
        # profiling indicated a significant overhead
        # calling write for each line.
        # this batch call is a lot faster :).
        # (4 seconds to 1 second for the sample upgrades I was testing).
        self.write(''.join(lines))
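

# Illustrative usage sketch (not part of the original file): writing a batch of
# lines through a single write() call, which is what the writelines() override
# above does to avoid per-line overhead.  The in-memory buffer, the helper name
# and the sample data are assumptions for the example only.
def _demo_batched_gzip_write():
    from cStringIO import StringIO
    buf = StringIO()
    gz = GzipFile(mode='wb', fileobj=buf)
    # one join and one write, instead of one write call per line
    gz.writelines(['line %d\n' % i for i in range(3)])
    gz.close()
    return buf.getvalue()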


class SequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION: stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
            # following improvement
            #     704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to
            #     704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400  0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize
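

# Illustrative usage sketch (not part of the original file): the knit code
# feeds lists of text lines to this tuned SequenceMatcher and turns the
# resulting opcodes into line-delta hunks (see _make_line_delta above).  The
# helper name and sample lines are assumptions for the example only.
def _demo_tuned_sequence_matcher():
    old_lines = ['a\n', 'b\n', 'c\n']
    new_lines = ['a\n', 'x\n', 'c\n']
    matcher = SequenceMatcher(None, old_lines, new_lines)
    # opcodes are (tag, i1, i2, j1, j2) tuples describing how to turn
    # old_lines[i1:i2] into new_lines[j1:j2]
    return matcher.get_opcodes()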