    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def apply_delta(self, delta):
        """Apply delta to this content."""
        offset = 0
        for start, end, count, lines in delta:
            self._lines[offset+start:offset+end] = lines
            offset = offset + (start - end) + count
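
    # Illustrative sketch of apply_delta (assumed data, not from the original
    # source): with self._lines == [('r0', 'a\n'), ('r0', 'b\n'), ('r0', 'c\n')],
    # applying the single hunk (1, 2, 2, [('r1', 'x\n'), ('r1', 'y\n')])
    # replaces the slice [1:2] (just 'b\n') with the two new lines, giving
    # [('r0', 'a\n'), ('r1', 'x\n'), ('r1', 'y\n'), ('r0', 'c\n')]; offset then
    # grows by (start - end) + count == 1, so a later hunk's indices still
    # land on the right lines.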

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = [text for origin, text in new_lines._lines]
        old_texts = [text for origin, text in self._lines]
        s = SequenceMatcher(None, old_texts, new_texts)
        for op in s.get_opcodes():
            if op[0] == 'equal':
                continue
            #     ofrom   oto   length  data
            yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])
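
        For example (an illustrative sketch, not taken from a real knit),
        the serialized hunk

            1,2,2
            rev-a alpha
            rev-a beta

        parses to (1, 2, 2, [('rev-a', 'alpha\n'), ('rev-a', 'beta\n')]);
        each content line keeps its trailing newline because only the first
        space is split on.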
        """
        # walk through the lines parsing.
        while lines:
            header = lines.pop(0)
            start, end, count = [int(n) for n in header.split(',')]
            contents = []
            for i in range(count):
                origin, text = lines.pop(0).split(' ', 1)
                contents.append((origin.decode('utf-8'), text))
            yield start, end, count, contents

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines))

    def lower_fulltext(self, content):
        """Convert a fulltext content record into a serializable form.
        self.delta = delta

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
            access_mode, create=create, file_mode=file_mode)
        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
            access_mode, create=create and not len(self), file_mode=file_mode)

    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, [])  # should we check the lines?
        self._check_versions_present(parents)
        present_parents = []
        ghosts = []
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta_parent is None:
            # reconstitute as full text.
            assert len(delta) == 1 or len(delta) == 0
            if len(delta):
                assert delta[0][0] == 0
                assert delta[0][1] == 0, delta[0][1]
            return super(KnitVersionedFile, self)._add_delta(version_id,
                parents, delta_parent, sha1, noeol, delta)

        digest = sha1

        options = []
        if noeol:
            options.append('no-eol')

        if delta_parent is not None:
            # determine the current delta chain length.
            # To speed the extraction of texts, the delta chain is limited
            # to a fixed number of deltas. This should minimize both
            # I/O and the time spent applying deltas.
            count = 0
            delta_parents = [delta_parent]
            while len(delta_parents) == 1 and count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # did not find a fulltext in the delta limit.
                # just do a normal insertion.
                return super(KnitVersionedFile, self)._add_delta(version_id,
                    parents, delta_parent, sha1, noeol, delta)

        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)

    def clear_cache(self):
        """Clear the data cache only."""

    def get_delta(self, version_id):
        """Get a delta for constructing version from some other version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        parents = self.get_parents(version_id)
        if len(parents):
            parent = parents[0]
        else:
            parent = None
        data_pos, data_size = self._index.get_position(version_id)
        data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
        version_idx = self._index.lookup(version_id)
        noeol = 'no-eol' in self._index.get_options(version_id)
        if 'fulltext' == self._index.get_method(version_id):
            new_content = self.factory.parse_fulltext(data, version_idx)
            if parent is not None:
                reference_content = self._get_content(parent)
                old_texts = reference_content.text()
            else:
                old_texts = []
            new_texts = new_content.text()
            delta_seq = SequenceMatcher(None, old_texts, new_texts)
            return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
        else:
            delta = self.factory.parse_line_delta(data, version_idx)
            return parent, sha1, noeol, delta

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    def get_sha1(self, version_id):
        """See VersionedFile.get_sha1()."""
        components = self._get_components(version_id)
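        # _get_components returns (version_id, method, data) tuples (see its
        # docstring below); the [-1][-1][-1] chain appears to take the last
        # component, its data element, and the sha1 digest stored there.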
return components[-1][-1][-1]

    @staticmethod
    def get_suffixes():
        """See VersionedFile.get_suffixes()."""

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None):
"""Merge annotations for content. This is done by comparing
361
the annotations based on changed to the text."""
362
for parent_id in parents:
363
merge_content = self._get_content(parent_id)
364
seq = SequenceMatcher(None, merge_content.text(), content.text())
365
for i, j, n in seq.get_matching_blocks():
368
content._lines[j:j+n] = merge_content._lines[i:i+n]
457
the annotations based on changed to the text.
461
        for parent_id in parents:
            merge_content = self._get_content(parent_id, parent_texts)
            seq = SequenceMatcher(None, merge_content.text(), content.text())
            if delta_seq is None:
                # setup a delta seq to reuse.
                delta_seq = seq
            for i, j, n in seq.get_matching_blocks():
                # this appears to copy (origin, text) pairs across to the new
                # content for any line that matches the last-checked parent.
                # FIXME: save the sequence control data for delta compression
                # against the most relevant parent rather than rediffing.
                content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            reference_content = self._get_content(parents[0], parent_texts)
            new_texts = content.text()
            old_texts = reference_content.text()
            delta_seq = SequenceMatcher(None, old_texts, new_texts)
            return self._make_line_delta(delta_seq, content)

    def _make_line_delta(self, delta_seq, new_content):
        """Generate a line delta from delta_seq and new_content."""
        diff_hunks = []
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
        return diff_hunks

    def _get_components(self, version_id):
        """Return a list of (version_id, method, data) tuples that
            raise RevisionNotPresent(list(version_ids)[0], self.filename)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _add_lines(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _check_add(self, version_id, lines):
        """Check that version_id and lines are safe to add."""
        for lineno, insert_id, dset, line in w.walk(version_ids):
            yield lineno, insert_id, dset, line

    def plan_merge(self, ver_a, ver_b):
        """See VersionedFile.plan_merge."""
        ancestors_b = set(self.get_ancestry(ver_b))
        def status_a(revision, text):
            if revision in ancestors_b:
                return 'killed-b', text
            else:
                return 'new-a', text

        ancestors_a = set(self.get_ancestry(ver_a))
        def status_b(revision, text):
            if revision in ancestors_a:
                return 'killed-a', text
            else:
                return 'new-b', text

        annotated_a = self.annotate(ver_a)
        annotated_b = self.annotate(ver_b)
        plain_a = [t for (a, t) in annotated_a]
        plain_b = [t for (a, t) in annotated_b]
        blocks = SequenceMatcher(None, plain_a, plain_b).get_matching_blocks()
        a_cur = 0
        b_cur = 0
        for ai, bi, l in blocks:
            # process all mismatched sections
            # (last mismatched section is handled because blocks always
            # includes a 0-length last block)
            for revision, text in annotated_a[a_cur:ai]:
                yield status_a(revision, text)
            for revision, text in annotated_b[b_cur:bi]:
                yield status_b(revision, text)

            # and now the matched section
            a_cur = ai + l
            b_cur = bi + l
            for text_a, text_b in zip(plain_a[ai:a_cur], plain_b[bi:b_cur]):
                assert text_a == text_b
                yield "unchanged", text_a

class _KnitComponentFile(object):
    """One of the files used to implement a knit database."""

    def __init__(self, transport, filename, mode, file_mode=None):
        self._transport = transport
        self._filename = filename
        self._mode = mode
        self._file_mode = file_mode

    def write_header(self):
        if self._transport.append(self._filename, StringIO(self.HEADER),
            mode=self._file_mode):
            raise KnitCorrupt(self._filename, 'misaligned after writing header')

    def check_header(self, fp):
        line = fp.read(len(self.HEADER))
        if line != self.HEADER:
            raise KnitHeaderError(badline=line)
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.

    The index file on disc contains a header, followed by one line per knit
    record. The same revision can be present in an index file more than once.
    The first occurrence gets assigned a sequence number starting from 0.

    The format of a single line is:
    REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n
    REVISION_ID is a utf8-encoded revision id
    FLAGS is a comma separated list of flags about the record. Values include
        no-eol, line-delta, fulltext.
    BYTE_OFFSET is the ascii representation of the byte offset in the data file
        that the compressed data starts at.
    LENGTH is the ascii representation of the length of the record in the
        data file.
    PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of
        REVISION_ID.
    PARENT_SEQUENCE_ID the ascii representation of the sequence number of a
        revision id already in the knit that is a parent of REVISION_ID.
    The ' :' marker is the end of record marker.
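
    For example, a purely hypothetical record line (not from a real index):

    rev-2 line-delta 120 73 .ghost-rev 0 :

    describes rev-2 stored as a line-delta at byte offset 120 for 73 bytes,
    with one parent spelled out as a '.'-prefixed revision id (used when the
    parent has no sequence number in this index) and one parent given by
    sequence number (the revision at index 0).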

    When a write to the index file is interrupted, it will result in a line
    that does not end in ' :'. If the ' :' is not present at the end of a line,
    or at the end of the file, then the record that is missing it will be
    ignored by the parser.

    When writing new records to the index file, the data is preceded by '\n'
    to ensure that records always start on new lines even if the last write was
    interrupted. As a result it's normal for the last line in the index to be
    missing a trailing newline. One can be added with no harmful effects.
    """

HEADER = "# bzr knit index 8\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
         indexes).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            index = len(self._history)
            self._history.append(version_id)
        else:
            index = self._cache[version_id][5]
        self._cache[version_id] = (version_id,
                                   options,
                                   pos,
                                   size,
                                   parents,
                                   index)

    def __init__(self, transport, filename, mode, create=False, file_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
            pb.update('read knit index', count, total)
            fp = self._transport.get(self._filename)
            self.check_header(fp)
            # readlines reads the whole file at once:
            # bad for transports like http, good for local disk
            # we save 60 ms doing this one change (
            # from calling readline each time to calling
            # readlines once.
            # probably what we want for nice behaviour on
            # http is an incremental readlines that yields, or
            # a check for local vs non local indexes,
            for l in fp.readlines():
                rec = l.split()
                if len(rec) < 5 or rec[-1] != ':':
                    # FIXME: in the future we should determine if it's a
                    # short write - and ignore it
                    # or a different failure, and raise. RBC 20060407
                    continue
                #pb.update('read knit index', count, total)
                # See self._parse_parents
                parents = []
                for value in rec[4:-1]:
                    if '.' == value[0]:
                        # uncompressed reference
                        parents.append(value[1:])
                    else:
                        # this is 15/4000ms faster than isinstance,
                        # this function is called thousands of times a
                        # second so small variations add up.
                        assert value.__class__ is str
                        parents.append(self._history[int(value)])
                # end self._parse_parents
                # self._cache_version(rec[0],
                #                     rec[1].split(','),
                # --- self._cache_version
                # only want the _history index to reference the 1st
                # index entry for version_id
                version_id = rec[0]
                if version_id not in self._cache:
                    index = len(self._history)
                    self._history.append(version_id)
                else:
                    index = self._cache[version_id][5]
                self._cache[version_id] = (version_id,
                                           rec[1].split(','),
                                           int(rec[2]),
                                           int(rec[3]),
                                           parents,
                                           index)
                # --- self._cache_version
        except NoSuchFile, e:
            if mode != 'w' or not create:

    def _parse_record(self, version_id, data):
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        df, rec = self._parse_record_header(version_id, data)
        record_contents = df.readlines()
        l = record_contents.pop()
        assert len(record_contents) == int(rec[2])
        if l.decode('utf-8') != 'end %s\n' % version_id:
            raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
                              % (l, version_id))
        df.close()
        return record_contents, rec[3]

    def read_records_iter_raw(self, records):
        """Read text records from data file and yield raw data.
self.target.fix_parents(version, new_parents)

InterVersionedFile.register_optimiser(InterKnit)


class SequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block. Then that block is extended as
        far as possible by matching (only) junk elements on both sides. So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk. That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly. Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """
# CAUTION: stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle. That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

# changing b2j.get(a[i], nothing) to a try:Keyerror pair produced the
            # following improvement
            # 704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            # +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # 704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400  0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            # +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end. In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too. Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it. In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

return besti, bestj, bestsize