        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
            access_mode, create=not len(self.versions()))

    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, [])  # should we check the lines ?
        self._check_versions_present(parents)
        present_parents = []
        ghosts = []
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta_parent is None:
            # reconstitute as full text.
            assert len(delta) == 1 or len(delta) == 0
            if len(delta):
                assert delta[0][0] == 0
                assert delta[0][1] == 0, delta[0][1]
            return super(KnitVersionedFile, self)._add_delta(version_id,
                parents, delta_parent, sha1, noeol, delta)

        digest = sha1

        options = []
        if noeol:
            options.append('no-eol')

        if delta_parent is not None:
            # determine the current delta chain length.
            # To speed the extraction of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spent applying deltas.
            count = 0
            delta_parents = [delta_parent]
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # did not find a fulltext in the delta limit.
                # just do a normal insertion.
                return super(KnitVersionedFile, self)._add_delta(version_id,
                    parents, delta_parent, sha1, noeol, delta)

        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)
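
    # Illustrative sketch (not part of the original bzrlib API): how the bounded
    # walk in _add_delta above measures a delta chain before deciding whether a
    # new delta may be stored.  The toy 'methods'/'parents' mappings, the helper
    # name and the limit of 25 are assumptions for the example only.
    def _sketch_delta_chain_length(methods, parents, version_id, limit=25):
        """Count deltas between version_id and its nearest stored fulltext.

        methods maps version -> 'fulltext' or 'line-delta'; parents maps
        version -> list of parents, where the first parent is the compression
        basis (mirroring how _add_delta walks delta_parents[0]).
        """
        count = 0
        current = version_id
        while count < limit and methods[current] == 'line-delta':
            current = parents[current][0]
            count += 1
        return count
    # e.g. _sketch_delta_chain_length({'a': 'line-delta', 'b': 'line-delta',
    #                                  'c': 'fulltext'},
    #                                 {'a': ['b'], 'b': ['c'], 'c': []},
    #                                 'a') == 2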

    def clear_cache(self):
        """Clear the data cache only."""
        self._data.clear_cache()

                                current_values[3],

    def get_delta(self, version_id):
        """Get a delta for constructing version from some other version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        parents = self.get_parents(version_id)
        if len(parents):
            parent = parents[0]
        else:
            parent = None
        data_pos, data_size = self._index.get_position(version_id)
        data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
        version_idx = self._index.lookup(version_id)
        noeol = 'no-eol' in self._index.get_options(version_id)
        if 'fulltext' == self._index.get_method(version_id):
            new_content = self.factory.parse_fulltext(data, version_idx)
            if parent is not None:
                reference_content = self._get_content(parent)
                old_texts = reference_content.text()
            else:
                old_texts = []
            new_texts = new_content.text()
            delta_seq = SequenceMatcher(None, old_texts, new_texts)
            return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
        else:
            delta = self.factory.parse_line_delta(data, version_idx)
            return parent, sha1, noeol, delta

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text.
        """
        if annotated:
            delta_seq = None
            for parent_id in parents:
                merge_content = self._get_content(parent_id, parent_texts)
                seq = SequenceMatcher(None, merge_content.text(), content.text())
                if delta_seq is None:
                    # setup a delta seq to reuse.
                    delta_seq = seq
                for i, j, n in seq.get_matching_blocks():
                    # this appears to copy (origin, text) pairs across to the new
                    # content for any line that matches the last-checked parent.
                    # FIXME: save the sequence control data for delta compression
                    # against the most relevant parent rather than rediffing.
                    content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            if not annotated:
                reference_content = self._get_content(parents[0], parent_texts)
                new_texts = content.text()
                old_texts = reference_content.text()
                delta_seq = SequenceMatcher(None, old_texts, new_texts)
            return self._make_line_delta(delta_seq, content)
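
    # Illustrative sketch (not part of the original bzrlib API): the core of the
    # annotation merge above, shown on plain (origin, text) tuples instead of
    # knit content objects.  The helper name and the list-of-tuples
    # representation are assumptions for the example only.
    def _sketch_copy_matching_annotations(parent_lines, new_lines):
        """Copy (origin, text) pairs from parent_lines onto matching new_lines.

        Both arguments are lists of (origin, text) tuples; any run of new_lines
        whose text matches a block of parent_lines inherits the parent origins,
        just as content._lines[j:j+n] = merge_content._lines[i:i+n] does above.
        """
        seq = SequenceMatcher(None,
                              [text for origin, text in parent_lines],
                              [text for origin, text in new_lines])
        for i, j, n in seq.get_matching_blocks():
            new_lines[j:j+n] = parent_lines[i:i+n]
        return new_lines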

    def _make_line_delta(self, delta_seq, new_content):
        """Generate a line delta from delta_seq and new_content."""
        diff_hunks = []
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
        return diff_hunks
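
    # Illustrative sketch (not part of the original bzrlib API): applying the
    # (start, end, count, lines) hunks produced by _make_line_delta to a list
    # of old lines.  The helper name is an assumption for the example only.
    def _sketch_apply_line_delta(old_lines, hunks):
        """Return the new lines built by replaying hunks over old_lines.

        Each hunk replaces old_lines[start:end] with the supplied lines;
        hunks are ordered by start, so the result is assembled left to right.
        """
        out = []
        pos = 0
        for start, end, count, lines in hunks:
            out.extend(old_lines[pos:start])   # unchanged region before the hunk
            out.extend(lines)                  # replacement lines from the delta
            pos = end
        out.extend(old_lines[pos:])            # unchanged tail
        return out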

    def _get_components(self, version_id):
        """Return a list of (version_id, method, data) tuples that
        make up the version specified by version_id of the knit.
        """

            raise RevisionNotPresent(list(version_ids)[0], self.filename)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _add_lines(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _check_add(self, version_id, lines):
        """check that version_id and lines are safe to add."""


class _KnitIndex(_KnitComponentFile):

    HEADER = "# bzr knit index 7\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
         indexes).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            self._history.append(version_id)
        self._cache[version_id] = (version_id, options, pos, size, parents)

    def __init__(self, transport, filename, mode, create=False):
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._cache = {}
        self._history = []
        count = 0
        total = 1
        try:
            pb.update('read knit index', count, total)
            fp = self._transport.get(self._filename)
            self.check_header(fp)
            # readlines reads the whole file at once:
            # bad for transports like http, good for local disk
            # we save 60 ms doing this one change (
            # from calling readline each time to calling
            # readlines once).
            # probably what we want for nice behaviour on
            # http is an incremental readlines that yields, or
            # a check for local vs non local indexes,
            for l in fp.readlines():
                rec = l.split()
                #pb.update('read knit index', count, total)
                # See self._parse_parents
                parents = []
                for value in rec[4:]:
                    if value[0] == '.':
                        # uncompressed reference
                        parents.append(value[1:])
                    else:
                        # this is 15/4000ms faster than isinstance,
                        # this function is called thousands of times a
                        # second so small variations add up.
                        assert value.__class__ is str
                        parents.append(self._history[int(value)])
                # end self._parse_parents
                # self._cache_version(rec[0],
                #                     rec[1].split(','),
                #                     int(rec[2]),
                #                     int(rec[3]),
                #                     parents)
                # --- self._cache_version
                # only want the _history index to reference the 1st
                # index entry for version_id
                version_id = rec[0]
                if version_id not in self._cache:
                    self._history.append(version_id)
                self._cache[version_id] = (version_id,
                                           rec[1].split(','),
                                           int(rec[2]),
                                           int(rec[3]),
                                           parents)
                # --- self._cache_version
        except NoSuchFile, e:
            if mode != 'w' or not create:
                raise
            self.write_header()
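
    # Illustrative sketch (not part of the original bzrlib API): the record
    # layout the parsing loop above expects.  Each index line holds
    # whitespace-separated fields: version-id, comma-separated options, byte
    # position, byte size, then parent references that are either
    # '.<version-id>' (an uncompressed reference) or an integer offset into the
    # history list built so far.  The helper name is an assumption.
    def _sketch_parse_index_line(line, history):
        """Parse one index line into (version_id, options, pos, size, parents)."""
        rec = line.split()
        parents = []
        for value in rec[4:]:
            if value[0] == '.':
                parents.append(value[1:])            # uncompressed reference
            else:
                parents.append(history[int(value)])  # compressed: history offset
        return rec[0], rec[1].split(','), int(rec[2]), int(rec[3]), parents
    # e.g. _sketch_parse_index_line('rev-2 line-delta 120 80 0 .ghost-rev',
    #                               ['rev-1']) ==
    #      ('rev-2', ['line-delta'], 120, 80, ['rev-1', 'ghost-rev'])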

                self.target.fix_parents(version, new_parents)


InterVersionedFile.register_optimiser(InterKnit)


# make GzipFile faster:

class GzipFile(gzip.GzipFile):
    """Knit tuned version of GzipFile.

    This is based on the following lsprof stats:
    python 2.4 stock GzipFile write:
     58971      0      5644.3090   2721.4730   gzip:193(write)
    +58971      0      1159.5530   1159.5530   +<built-in method compress>
    +176913     0       987.0320    987.0320   +<len>
    +58971      0       423.1450    423.1450   +<zlib.crc32>
    +58971      0       353.1060    353.1060   +<method 'write' of 'cStringIO.
    tuned GzipFile write:
     58971      0      4477.2590   2103.1120   bzrlib.knit:1250(write)
    +58971      0      1297.7620   1297.7620   +<built-in method compress>
    +58971      0       406.2160    406.2160   +<zlib.crc32>
    +58971      0       341.9020    341.9020   +<method 'write' of 'cStringIO.
    +58971      0       328.2670    328.2670   +<len>

    Yes, it's only 1.6 seconds, but they add up.
    """

    def write(self, data):
        if self.mode != gzip.WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"
        data_len = len(data)
        self.size = self.size + data_len
        self.crc = zlib.crc32(data, self.crc)
        self.fileobj.write(self.compress.compress(data))
        self.offset += data_len

    def writelines(self, lines):
        # profiling indicated a significant overhead
        # calling write for each line.
        # this batch call is a lot faster :).
        # (4 seconds to 1 second for the sample upgrades I was testing).
        self.write(''.join(lines))
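

# Illustrative usage sketch (not part of the original file): writing a batch of
# lines through a single write() call, which is what the writelines() override
# above does to avoid per-line overhead.  The in-memory buffer, the helper name
# and the sample data are assumptions for the example only.
def _demo_batched_gzip_write():
    from cStringIO import StringIO
    buf = StringIO()
    gz = GzipFile(mode='wb', fileobj=buf)
    # one join and one write, instead of one write call per line
    gz.writelines(['line %d\n' % i for i in range(3)])
    gz.close()
    return buf.getvalue()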


class SequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION: stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
            # following improvement
            #     704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to
            #     704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400  0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize
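

# Illustrative usage sketch (not part of the original file): the knit code
# feeds lists of text lines to this tuned SequenceMatcher and turns the
# resulting opcodes into line-delta hunks (see _make_line_delta above).  The
# helper name and sample lines are assumptions for the example only.
def _demo_tuned_sequence_matcher():
    old_lines = ['a\n', 'b\n', 'c\n']
    new_lines = ['a\n', 'x\n', 'c\n']
    matcher = SequenceMatcher(None, old_lines, new_lines)
    # opcodes are (tag, i1, i2, j1, j2) tuples describing how to turn
    # old_lines[i1:i2] into new_lines[j1:j2]
    return matcher.get_opcodes()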