2394
2394
InterVersionedFile.register_optimiser(WeaveToKnit)
2397
class KnitSequenceMatcher(difflib.SequenceMatcher):
2398
"""Knit tuned sequence matcher.
2400
This is based on profiling of difflib which indicated some improvements
2401
for our usage pattern.
2404
def find_longest_match(self, alo, ahi, blo, bhi):
2405
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
2407
If isjunk is not defined:
2409
Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
2410
alo <= i <= i+k <= ahi
2411
blo <= j <= j+k <= bhi
2412
and for all (i',j',k') meeting those conditions,
2415
and if i == i', j <= j'
2417
In other words, of all maximal matching blocks, return one that
2418
starts earliest in a, and of all those maximal matching blocks that
2419
start earliest in a, return the one that starts earliest in b.
2421
>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
2422
>>> s.find_longest_match(0, 5, 0, 9)
2425
If isjunk is defined, first the longest matching block is
2426
determined as above, but with the additional restriction that no
2427
junk element appears in the block. Then that block is extended as
2428
far as possible by matching (only) junk elements on both sides. So
2429
the resulting block never matches on junk except as identical junk
2430
happens to be adjacent to an "interesting" match.
2432
Here's the same example as before, but considering blanks to be
2433
junk. That prevents " abcd" from matching the " abcd" at the tail
2434
end of the second sequence directly. Instead only the "abcd" can
2435
match, and matches the leftmost "abcd" in the second sequence:
2437
>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
2438
>>> s.find_longest_match(0, 5, 0, 9)
2441
If no blocks match, return (alo, blo, 0).
2443
>>> s = SequenceMatcher(None, "ab", "c")
2444
>>> s.find_longest_match(0, 2, 0, 1)
2448
# CAUTION: stripping common prefix or suffix would be incorrect.
2452
# Longest matching block is "ab", but if common prefix is
2453
# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
2454
# strip, so ends up claiming that ab is changed to acab by
2455
# inserting "ca" in the middle. That's minimal but unintuitive:
2456
# "it's obvious" that someone inserted "ac" at the front.
2457
# Windiff ends up at the same place as diff, but by pairing up
2458
# the unique 'b's and then matching the first two 'a's.
2460
a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
2461
besti, bestj, bestsize = alo, blo, 0
2462
# find longest junk-free match
2463
# during an iteration of the loop, j2len[j] = length of longest
2464
# junk-free match ending with a[i-1] and b[j]
2468
for i in xrange(alo, ahi):
2469
# look at all instances of a[i] in b; note that because
2470
# b2j has no junk keys, the loop is skipped if a[i] is junk
2471
j2lenget = j2len.get
2474
# changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
2475
# following improvement
2476
# 704 0 4650.5320 2620.7410 bzrlib.knit:1336(find_longest_match)
2477
# +326674 0 1655.1210 1655.1210 +<method 'get' of 'dict' objects>
2478
# +76519 0 374.6700 374.6700 +<method 'has_key' of 'dict' objects>
2480
# 704 0 3733.2820 2209.6520 bzrlib.knit:1336(find_longest_match)
2481
# +211400 0 1147.3520 1147.3520 +<method 'get' of 'dict' objects>
2482
# +76519 0 376.2780 376.2780 +<method 'has_key' of 'dict' objects>
2494
k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
2496
besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
2499
# Extend the best by non-junk elements on each end. In particular,
2500
# "popular" non-junk elements aren't in b2j, which greatly speeds
2501
# the inner loop above, but also means "the best" match so far
2502
# doesn't contain any junk *or* popular non-junk elements.
2503
while besti > alo and bestj > blo and \
2504
not isbjunk(b[bestj-1]) and \
2505
a[besti-1] == b[bestj-1]:
2506
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
2507
while besti+bestsize < ahi and bestj+bestsize < bhi and \
2508
not isbjunk(b[bestj+bestsize]) and \
2509
a[besti+bestsize] == b[bestj+bestsize]:
2512
# Now that we have a wholly interesting match (albeit possibly
2513
# empty!), we may as well suck up the matching junk on each
2514
# side of it too. Can't think of a good reason not to, and it
2515
# saves post-processing the (possibly considerable) expense of
2516
# figuring out what to do with it. In the case of an empty
2517
# interesting match, this is clearly the right thing to do,
2518
# because no other kind of match is possible in the regions.
2519
while besti > alo and bestj > blo and \
2520
isbjunk(b[bestj-1]) and \
2521
a[besti-1] == b[bestj-1]:
2522
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
2523
while besti+bestsize < ahi and bestj+bestsize < bhi and \
2524
isbjunk(b[bestj+bestsize]) and \
2525
a[besti+bestsize] == b[bestj+bestsize]:
2526
bestsize = bestsize + 1
2528
return besti, bestj, bestsize
2397
# Deprecated, use PatienceSequenceMatcher instead
2398
KnitSequenceMatcher = patiencediff.PatienceSequenceMatcher
2531
2401
def annotate_knit(knit, revision_id):