~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

Committer: Martin Pool
Date: 2007-09-04 09:10:35 UTC
mto: This revision was merged to the branch mainline in revision 2798.
Revision ID: mbp@sourcefrog.net-20070904091035-d11e7tfk55hy0a2z

merge cpatiencediff from Lukas

files added:
bzrlib/_patiencediff_c.c

bzrlib/patiencediff.py

files renamed:
bzrlib/patiencediff.py => bzrlib/_patiencediff_py.py

files modified:
NEWS

bzrlib/knit.py

bzrlib/tests/test_diff.py

setup.py

Show diffs side-by-side

added added

removed removed

bzrlib/knit.py

from copy import copy

from cStringIO import StringIO

import difflib

from itertools import izip, chain

import operator

import os

149

148

"""Generate line-based delta from this content to new_lines."""

150

149

new_texts = new_lines.text()

151

150

old_texts = self.text()

152

s = KnitSequenceMatcher(None, old_texts, new_texts)

151

s = patiencediff.PatienceSequenceMatcher(None, old_texts, new_texts)

153

152

for tag, i1, i2, j1, j2 in s.get_opcodes():

154

153

if tag == 'equal':

155

154

continue

661

660

else:

662

661

old_texts = []

663

662

new_texts = new_content.text()

664

delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)

663

delta_seq = patiencediff.PatienceSequenceMatcher(None, old_texts,

664

new_texts)

665

return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)

666

else:

667

delta = self.factory.parse_line_delta(data, version_id)

2394

InterVersionedFile.register_optimiser(WeaveToKnit)

2395

2396

2397

class KnitSequenceMatcher(difflib.SequenceMatcher):

2398

"""Knit tuned sequence matcher.

2399

2400

This is based on profiling of difflib which indicated some improvements

2401

for our usage pattern.

2402

"""

2403

2404

def find_longest_match(self, alo, ahi, blo, bhi):

2405

"""Find longest matching block in a[alo:ahi] and b[blo:bhi].

2406

2407

If isjunk is not defined:

2408

2409

Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where

2410

alo <= i <= i+k <= ahi

2411

blo <= j <= j+k <= bhi

2412

and for all (i',j',k') meeting those conditions,

2413

k >= k'

2414

i <= i'

2415

and if i == i', j <= j'

2416

2417

In other words, of all maximal matching blocks, return one that

2418

starts earliest in a, and of all those maximal matching blocks that

2419

start earliest in a, return the one that starts earliest in b.

2420

2421

>>> s = SequenceMatcher(None, " abcd", "abcd abcd")

2422

>>> s.find_longest_match(0, 5, 0, 9)

2423

(0, 4, 5)

2424

2425

If isjunk is defined, first the longest matching block is

2426

determined as above, but with the additional restriction that no

2427

junk element appears in the block. Then that block is extended as

2428

far as possible by matching (only) junk elements on both sides. So

2429

the resulting block never matches on junk except as identical junk

2430

happens to be adjacent to an "interesting" match.

2431

2432

Here's the same example as before, but considering blanks to be

2433

junk. That prevents " abcd" from matching the " abcd" at the tail

2434

end of the second sequence directly. Instead only the "abcd" can

2435

match, and matches the leftmost "abcd" in the second sequence:

2436

2437

>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")

2438

>>> s.find_longest_match(0, 5, 0, 9)

2439

(1, 0, 4)

2440

2441

If no blocks match, return (alo, blo, 0).

2442

2443

>>> s = SequenceMatcher(None, "ab", "c")

2444

>>> s.find_longest_match(0, 2, 0, 1)

2445

(0, 0, 0)

2446

"""

2447

2448

# CAUTION: stripping common prefix or suffix would be incorrect.

2449

# E.g.,

2450

# ab

2451

# acab

2452

# Longest matching block is "ab", but if common prefix is

2453

# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so

2454

# strip, so ends up claiming that ab is changed to acab by

2455

# inserting "ca" in the middle. That's minimal but unintuitive:

2456

# "it's obvious" that someone inserted "ac" at the front.

2457

# Windiff ends up at the same place as diff, but by pairing up

2458

# the unique 'b's and then matching the first two 'a's.

2459

2460

a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk

2461

besti, bestj, bestsize = alo, blo, 0

2462

# find longest junk-free match

2463

# during an iteration of the loop, j2len[j] = length of longest

2464

# junk-free match ending with a[i-1] and b[j]

2465

j2len = {}

2466

# nothing = []

2467

b2jget = b2j.get

2468

for i in xrange(alo, ahi):

2469

# look at all instances of a[i] in b; note that because

2470

# b2j has no junk keys, the loop is skipped if a[i] is junk

2471

j2lenget = j2len.get

2472

newj2len = {}

2473

2474

# changing b2j.get(a[i], nothing) to a try:KeyError pair produced the

2475

# following improvement

2476

# 704 0 4650.5320 2620.7410 bzrlib.knit:1336(find_longest_match)

2477

# +326674 0 1655.1210 1655.1210 +<method 'get' of 'dict' objects>

2478

# +76519 0 374.6700 374.6700 +<method 'has_key' of 'dict' objects>

2479

# to

2480

# 704 0 3733.2820 2209.6520 bzrlib.knit:1336(find_longest_match)

2481

# +211400 0 1147.3520 1147.3520 +<method 'get' of 'dict' objects>

2482

# +76519 0 376.2780 376.2780 +<method 'has_key' of 'dict' objects>

2483

2484

try:

2485

js = b2j[a[i]]

2486

except KeyError:

2487

pass

2488

else:

2489

for j in js:

2490

# a[i] matches b[j]

2491

if j >= blo:

2492

if j >= bhi:

2493

break

2494

k = newj2len[j] = 1 + j2lenget(-1 + j, 0)

2495

if k > bestsize:

2496

besti, bestj, bestsize = 1 + i-k, 1 + j-k, k

2497

j2len = newj2len

2498

2499

# Extend the best by non-junk elements on each end. In particular,

2500

# "popular" non-junk elements aren't in b2j, which greatly speeds

2501

# the inner loop above, but also means "the best" match so far

2502

# doesn't contain any junk *or* popular non-junk elements.

2503

while besti > alo and bestj > blo and \

2504

not isbjunk(b[bestj-1]) and \

2505

a[besti-1] == b[bestj-1]:

2506

besti, bestj, bestsize = besti-1, bestj-1, bestsize+1

2507

while besti+bestsize < ahi and bestj+bestsize < bhi and \

2508

not isbjunk(b[bestj+bestsize]) and \

2509

a[besti+bestsize] == b[bestj+bestsize]:

2510

bestsize += 1

2511

2512

# Now that we have a wholly interesting match (albeit possibly

2513

# empty!), we may as well suck up the matching junk on each

2514

# side of it too. Can't think of a good reason not to, and it

2515

# saves post-processing the (possibly considerable) expense of

2516

# figuring out what to do with it. In the case of an empty

2517

# interesting match, this is clearly the right thing to do,

2518

# because no other kind of match is possible in the regions.

2519

while besti > alo and bestj > blo and \

2520

isbjunk(b[bestj-1]) and \

2521

a[besti-1] == b[bestj-1]:

2522

besti, bestj, bestsize = besti-1, bestj-1, bestsize+1

2523

while besti+bestsize < ahi and bestj+bestsize < bhi and \

2524

isbjunk(b[bestj+bestsize]) and \

2525

a[besti+bestsize] == b[bestj+bestsize]:

2526

bestsize = bestsize + 1

2527

2528

return besti, bestj, bestsize

2397

# Deprecated, use PatienceSequenceMatcher instead

2398

KnitSequenceMatcher = patiencediff.PatienceSequenceMatcher

2529

2399

2530

2400

2531

2401

def annotate_knit(knit, revision_id):

Older »