~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/weave.py

Committer: Martin Pool
Date: 2005-07-11 07:05:34 UTC
Revision ID: mbp@sourcefrog.net-20050711070534-5227696ab167ccde

- merge aaron's append_multiple.patch

files added:
plugins/changeset

plugins/changeset/__init__.py

plugins/changeset/apply_changeset.py

plugins/changeset/common.py

plugins/changeset/gen_changeset.py

plugins/changeset/read_changeset.py

files removed:
HACKING

bzrlib/delta.py

bzrlib/fetch.py

bzrlib/intset.py

bzrlib/missing.py

bzrlib/plugins/__init__.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/teststatus.py

bzrlib/util

bzrlib/util/__init__.py

contrib/emacs

contrib/emacs/bzr-mode.el

doc/split-join-files.txt

notes/revfile.txt

patches/pending-merge.patch

tools/history2revfiles.py

tools/weavemerge.sh

tutorial.txt

files renamed:
bzrlib/util/effbot/ => effbot/

bzrlib/util/elementtree/ => elementtree/

bzrlib/plugins/ => plugins/

bzrlib/util/urlgrabber/ => urlgrabber/

files modified:
.bzrignore

NEWS

README

TODO

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/changeset.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/inventory.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/mdiff.py

bzrlib/merge.py

bzrlib/merge_core.py

bzrlib/newinventory.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/status.py

bzrlib/store.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/workingtree.py

bzrlib/xml.py

setup.py

testsweet.py

tools/convertfile.py

tools/testweave.py

tools/weavebench.py

Show diffs side-by-side

added added

removed removed

bzrlib/weave.py

"""Weave - storage of related text file versions"""

# before intset (r923) 2000 versions in 41.5s

# with intset (r926) 2000 versions in 93s !!!

# better to just use plain sets.

# making _extract build and return a list, rather than being a generator

# takes 37.94s

# with python -O, r923 does 2000 versions in 36.87s

# with optimizations to avoid mutating lists - 35.75! I guess copying

# all the elements every time costs more than the small manipulations.

# a surprisingly small change.

# r931, which avoids using a generator for extract, does 36.98s

# with memoized inclusions, takes 41.49s; not very good

# with slots, takes 37.35s; without takes 39.16, a bit surprising

# with the delta calculation mixed in with the add method, rather than

# separated, takes 36.78s

# with delta folded in and mutation of the list, 36.13s

# with all this and simplification of add code, 33s

# TODO: Perhaps have copy method for Weave instances?

# XXX: If we do weaves this way, will a merge still behave the same

# way if it's done in a different order? That's a pretty desirable

# property.

# TODO: How to write these to disk? One option is cPickle, which

# would be fast but less friendly to C, and perhaps not portable. Another is

# TODO: Nothing here so far assumes the lines are really \n newlines,

# rather than being split up in some other way. We could accommodate

# binaries, perhaps by naively splitting on \n or perhaps using

# something like a rolling checksum.

# TODO: Perhaps track SHA-1 in the header for protection? This would

# be redundant with it being stored in the inventory, but perhaps

# usefully so?

# TODO: Track version names as well as indexes.

# TODO: Probably do transitive expansion when specifying parents?

# TODO: Separate out some code to read and write weaves.

# TODO: End marker for each version so we can stop reading?

# TODO: Check that no insertion occurs inside a deletion that was

# active in the version of the insertion.

# TODO: In addition to the SHA-1 check, perhaps have some code that

# checks structural constraints of the weave: ie that insertions are

# properly nested, that there is no text outside of an insertion, that

# insertions or deletions are not repeated, etc.

# TODO: Parallel-extract that passes back each line along with a

# description of which revisions include it. Nice for checking all

# shas in parallel.

# TODO: Perhaps a special slower check() method that verifies more

# nesting constraints and the MD5 of each version?

try:

set

frozenset

except NameError:

from sets import Set, ImmutableSet

set = Set

frozenset = ImmutableSet

del Set, ImmutableSet

class WeaveError(Exception):

107

the version-id is used to reference it in the larger world.

108

109

The weave is represented as a list mixing edit instructions and

110

literal text. Each entry in _weave can be either a string (or

literal text. Each entry in _l can be either a string (or

111

unicode), or a tuple. If a string, it means that the given line

112

should be output in the currently active revisions.

113

100

151

138

should be no way to get an earlier version deleting a later

152

139

version.

153

140

154

_weave

155

Text of the weave; list of control instruction tuples and strings.

141

142

Text of the weave.

156

143

157

_parents

144

158

145

List of parents, indexed by version number.

159

146

It is only necessary to store the minimal set of parents for

160

147

each version; the parent's parents are implied.

162

149

_sha1s

163

150

List of hex SHA-1 of each version, or None if not recorded.

164

151

"""

165

166

__slots__ = ['_weave', '_parents', '_sha1s']

167

168

152

def __init__(self):

169

self._weave = []

170

self._parents = []

153

self._l = []

154

self._v = []

171

155

self._sha1s = []

172

156

173

157

174

158

def __eq__(self, other):

175

159

if not isinstance(other, Weave):

176

160

return False

177

return self._parents == other._parents \

178

and self._weave == other._weave

161

return self._v == other._v \

162

and self._l == other._l

179

163

180

164

181

165

def __ne__(self, other):

192

176

193

177

text

194

178

Sequence of lines to be added in the new version."""

195

196

self._check_versions(parents)

179

## self._check_versions(parents)

197

180

## self._check_lines(text)

198

new_version = len(self._parents)

181

idx = len(self._v)

199

182

200

183

import sha

201

184

s = sha.new()

202

map(s.update, text)

185

for l in text:

186

s.update(l)

203

187

sha1 = s.hexdigest()

204

188

del s

205

189

206

# if we abort after here the weave will be corrupt

207

self._parents.append(frozenset(parents))

190

if parents:

191

ancestors = self.inclusions(parents)

192

delta = self._delta(ancestors, text)

193

194

# offset gives the number of lines that have been inserted

195

# into the weave up to the current point; if the original edit instruction

196

# says to change line A then we actually change (A+offset)

197

offset = 0

198

199

for i1, i2, newlines in delta:

200

assert 0 <= i1

201

assert i1 <= i2

202

assert i2 <= len(self._l)

203

204

# the deletion and insertion are handled separately.

205

# first delete the region.

206

if i1 != i2:

207

self._l.insert(i1+offset, ('[', idx))

208

self._l.insert(i2+offset+1, (']', idx))

209

offset += 2

210

# is this OK???

211

212

if newlines:

213

# there may have been a deletion spanning up to

214

# i2; we want to insert after this region to make sure

215

# we don't destroy ourselves

216

i = i2 + offset

217

self._l[i:i] = [('{', idx)] \

218

+ newlines \

219

+ [('}', idx)]

220

offset += 2 + len(newlines)

221

222

self._addversion(parents)

223

else:

224

# special case; adding with no parents revision; can do this

225

# more quickly by just appending unconditionally

226

self._l.append(('{', idx))

227

self._l += text

228

self._l.append(('}', idx))

229

230

self._addversion(None)

231

208

232

self._sha1s.append(sha1)

209

210

211

if not parents:

212

# special case; adding with no parents revision; can do

213

# this more quickly by just appending unconditionally.

214

# even more specially, if we're adding an empty text we

215

# need do nothing at all.

216

if text:

217

self._weave.append(('{', new_version))

218

self._weave.extend(text)

219

self._weave.append(('}', new_version))

220

221

return new_version

222

223

if len(parents) == 1:

224

pv = list(parents)[0]

225

if sha1 == self._sha1s[pv]:

226

# special case: same as the single parent

227

return new_version

228

229

230

ancestors = self.inclusions(parents)

231

232

l = self._weave

233

234

# basis a list of (origin, lineno, line)

235

basis_lineno = []

236

basis_lines = []

237

for origin, lineno, line in self._extract(ancestors):

238

basis_lineno.append(lineno)

239

basis_lines.append(line)

240

241

# another small special case: a merge, producing the same text

242

# as auto-merge

243

if text == basis_lines:

244

return new_version

245

246

# add a sentinel, because we can also match against the final line

247

basis_lineno.append(len(self._weave))

248

249

# XXX: which line of the weave should we really consider

250

# matches the end of the file? the current code says it's the

251

# last line of the weave?

252

253

#print 'basis_lines:', basis_lines

254

#print 'new_lines: ', lines

255

256

from difflib import SequenceMatcher

257

s = SequenceMatcher(None, basis_lines, text)

258

259

# offset gives the number of lines that have been inserted

260

# into the weave up to the current point; if the original edit instruction

261

# says to change line A then we actually change (A+offset)

262

offset = 0

263

264

for tag, i1, i2, j1, j2 in s.get_opcodes():

265

# i1,i2 are given in offsets within basis_lines; we need to map them

266

# back to offsets within the entire weave

267

#print 'raw match', tag, i1, i2, j1, j2

268

if tag == 'equal':

269

continue

270

271

i1 = basis_lineno[i1]

272

i2 = basis_lineno[i2]

273

274

assert 0 <= j1 <= j2 <= len(text)

275

276

#print tag, i1, i2, j1, j2

277

278

# the deletion and insertion are handled separately.

279

# first delete the region.

280

if i1 != i2:

281

self._weave.insert(i1+offset, ('[', new_version))

282

self._weave.insert(i2+offset+1, (']', new_version))

283

offset += 2

284

285

if j1 != j2:

286

# there may have been a deletion spanning up to

287

# i2; we want to insert after this region to make sure

288

# we don't destroy ourselves

289

i = i2 + offset

290

self._weave[i:i] = ([('{', new_version)]

291

+ text[j1:j2]

292

+ [('}', new_version)])

293

offset += 2 + (j2 - j1)

294

295

return new_version

233

234

return idx

296

235

297

236

298

237

def inclusions(self, versions):

303

242

while v >= 0:

304

243

if v in i:

305

244

# include all its parents

306

i.update(self._parents[v])

245

i.update(self._v[v])

307

246

v -= 1

308

247

return i

309

248

except IndexError:

312

251

313

252

def minimal_parents(self, version):

314

253

"""Find the minimal set of parents for the version."""

315

included = self._parents[version]

254

included = self._v[version]

316

255

if not included:

317

256

return []

318

257

332

271

return mininc

333

272

334

273

274

def _addversion(self, parents):

275

if parents:

276

self._v.append(parents)

277

else:

278

self._v.append(frozenset())

279

335

280

336

281

def _check_lines(self, text):

337

282

if not isinstance(text, list):

348

293

"""Check everything in the sequence of indexes is valid"""

349

294

for i in indexes:

350

295

try:

351

self._parents[i]

296

self._v[i]

352

297

except IndexError:

353

298

raise IndexError("invalid version number %r" % i)

354

299

365

310

yield origin, text

366

311

367

312

368

def _walk(self):

369

"""Walk the weave.

370

371

Yields sequence of

372

(lineno, insert, deletes, text)

373

for each literal line.

374

"""

375

376

istack = []

377

dset = set()

378

379

lineno = 0 # line of weave, 0-based

380

381

for l in self._weave:

382

if isinstance(l, tuple):

383

c, v = l

384

isactive = None

385

if c == '{':

386

istack.append(v)

387

elif c == '}':

388

oldv = istack.pop()

389

elif c == '[':

390

assert v not in dset

391

dset.add(v)

392

elif c == ']':

393

dset.remove(v)

394

else:

395

raise WeaveFormatError('unexpected instruction %r'

396

% v)

397

else:

398

assert isinstance(l, basestring)

399

assert istack

400

yield lineno, istack[-1], dset, l

401

lineno += 1

402

403

404

405

313

def _extract(self, versions):

406

314

"""Yield annotation of lines in included set.

407

315

420

328

421

329

isactive = None

422

330

423

result = []

424

425

331

WFE = WeaveFormatError

426

332

427

for l in self._weave:

333

for l in self._l:

428

334

if isinstance(l, tuple):

429

335

c, v = l

430

336

isactive = None

448

354

if isactive is None:

449

355

isactive = (not dset) and istack and (istack[-1] in included)

450

356

if isactive:

451

result.append((istack[-1], lineno, l))

357

yield istack[-1], lineno, l

452

358

lineno += 1

453

359

454

360

if istack:

458

364

raise WFE("unclosed deletion blocks at end of weave",

459

365

dset)

460

366

461

return result

462

463

464

367

465

368

def get_iter(self, version):

466

369

"""Yield lines for the specified version."""

474

377

475

378

def mash_iter(self, included):

476

379

"""Return composed version of multiple included versions."""

380

included = frozenset(included)

477

381

for origin, lineno, text in self._extract(included):

478

382

yield text

479

383

480

384

481

385

def dump(self, to_file):

482

386

from pprint import pprint

483

print >>to_file, "Weave._weave = ",

484

pprint(self._weave, to_file)

485

print >>to_file, "Weave._parents = ",

486

pprint(self._parents, to_file)

387

print >>to_file, "Weave._l = ",

388

pprint(self._l, to_file)

389

print >>to_file, "Weave._v = ",

390

pprint(self._v, to_file)

487

391

488

392

489

393

490

394

def numversions(self):

491

l = len(self._parents)

395

l = len(self._v)

492

396

assert l == len(self._sha1s)

493

397

return l

494

398

495

399

496

def __len__(self):

497

return self.numversions()

498

499

500

400

def check(self, progress_bar=None):

501

401

# check no circular inclusions

502

402

for version in range(self.numversions()):

503

inclusions = list(self._parents[version])

403

inclusions = list(self._v[version])

504

404

if inclusions:

505

405

inclusions.sort()

506

406

if inclusions[-1] >= version:

566

466

If line1=line2, this is a pure insert; if newlines=[] this is a

567

467

pure delete. (Similar to difflib.)

568

468

"""

569

570

571

572

def plan_merge(self, ver_a, ver_b):

573

"""Return pseudo-annotation indicating how the two versions merge.

574

575

This is computed between versions a and b and their common

576

base.

577

578

Weave lines present in none of them are skipped entirely.

579

"""

580

inc_a = self.inclusions([ver_a])

581

inc_b = self.inclusions([ver_b])

582

inc_c = inc_a & inc_b

583

584

for lineno, insert, deleteset, line in self._walk():

585

if deleteset & inc_c:

586

# killed in parent; can't be in either a or b

587

# not relevant to our work

588

yield 'killed-base', line

589

elif insert in inc_c:

590

# was inserted in base

591

killed_a = bool(deleteset & inc_a)

592

killed_b = bool(deleteset & inc_b)

593

if killed_a and killed_b:

594

yield 'killed-both', line

595

elif killed_a:

596

yield 'killed-a', line

597

elif killed_b:

598

yield 'killed-b', line

599

else:

600

yield 'unchanged', line

601

elif insert in inc_a:

602

if deleteset & inc_a:

603

yield 'ghost-a', line

604

else:

605

# new in A; not in B

606

yield 'new-a', line

607

elif insert in inc_b:

608

if deleteset & inc_b:

609

yield 'ghost-b', line

610

else:

611

yield 'new-b', line

612

else:

613

# not in either revision

614

yield 'irrelevant', line

615

616

yield 'unchanged', '' # terminator

617

618

619

620

def weave_merge(self, plan):

621

lines_a = []

622

lines_b = []

623

ch_a = ch_b = False

624

625

for state, line in plan:

626

if state == 'unchanged' or state == 'killed-both':

627

# resync and flush queued conflicts changes if any

628

if not lines_a and not lines_b:

629

pass

630

elif ch_a and not ch_b:

631

# one-sided change:

632

for l in lines_a: yield l

633

elif ch_b and not ch_a:

634

for l in lines_b: yield l

635

elif lines_a == lines_b:

636

for l in lines_a: yield l

637

else:

638

yield '<<<<\n'

639

for l in lines_a: yield l

640

yield '====\n'

641

for l in lines_b: yield l

642

yield '>>>>\n'

643

644

del lines_a[:]

645

del lines_b[:]

646

ch_a = ch_b = False

647

648

if state == 'unchanged':

649

if line:

650

yield line

651

elif state == 'killed-a':

652

ch_a = True

653

lines_b.append(line)

654

elif state == 'killed-b':

655

ch_b = True

656

lines_a.append(line)

657

elif state == 'new-a':

658

ch_a = True

659

lines_a.append(line)

660

elif state == 'new-b':

661

ch_b = True

662

lines_b.append(line)

663

else:

664

assert state in ('irrelevant', 'ghost-a', 'ghost-b', 'killed-base',

665

'killed-both'), \

666

state

667

668

669

670

671

672

673

674

def weave_info(w):

469

# basis a list of (origin, lineno, line)

470

basis_lineno = []

471

basis_lines = []

472

for origin, lineno, line in self._extract(included):

473

basis_lineno.append(lineno)

474

basis_lines.append(line)

475

476

# add a sentinel, because we can also match against the final line

477

basis_lineno.append(len(self._l))

478

479

# XXX: which line of the weave should we really consider

480

# matches the end of the file? the current code says it's the

481

# last line of the weave?

482

483

from difflib import SequenceMatcher

484

s = SequenceMatcher(None, basis_lines, lines)

485

486

# TODO: Perhaps return line numbers from composed weave as well?

487

488

for tag, i1, i2, j1, j2 in s.get_opcodes():

489

##print tag, i1, i2, j1, j2

490

491

if tag == 'equal':

492

continue

493

494

# i1,i2 are given in offsets within basis_lines; we need to map them

495

# back to offsets within the entire weave

496

real_i1 = basis_lineno[i1]

497

real_i2 = basis_lineno[i2]

498

499

assert 0 <= j1

500

assert j1 <= j2

501

assert j2 <= len(lines)

502

503

yield real_i1, real_i2, lines[j1:j2]

504

505

506

507

def weave_info(filename, out):

675

508

"""Show some text information about the weave."""

676

print '%6s %40s %20s' % ('ver', 'sha1', 'parents')

677

for i in (6, 40, 20):

678

print '-' * i,

679

680

for i in range(w.numversions()):

681

sha1 = w._sha1s[i]

682

print '%6d %40s %s' % (i, sha1, ' '.join(map(str, w._parents[i])))

683

684

685

686

def weave_stats(weave_file):

687

from bzrlib.progress import ProgressBar

688

from bzrlib.weavefile import read_weave

689

690

pb = ProgressBar()

691

692

wf = file(weave_file, 'rb')

509

from weavefile import read_weave

510

wf = file(filename, 'rb')

693

511

w = read_weave(wf)

694

512

# FIXME: doesn't work on pipes

695

513

weave_size = wf.tell()

514

print >>out, "weave file size %d bytes" % weave_size

515

print >>out, "weave contains %d versions" % len(w._v)

696

516

697

517

total = 0

698

vers = len(w)

699

for i in range(vers):

700

pb.update('checking sizes', i, vers)

701

for line in w.get_iter(i):

702

total += len(line)

703

704

pb.clear()

705

706

print 'versions %9d' % vers

707

print 'weave file %9d bytes' % weave_size

708

print 'total contents %9d bytes' % total

709

print 'compression ratio %9.2fx' % (float(total) / float(weave_size))

710

if vers:

711

avg = total/vers

712

print 'average size %9d bytes' % avg

713

print 'relative size %9.2fx' % (float(weave_size) / float(avg))

518

print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')

519

for i in (6, 6, 8, 40, 20):

520

print '-' * i,

521

522

for i in range(len(w._v)):

523

text = w.get(i)

524

lines = len(text)

525

bytes = sum((len(a) for a in text))

526

sha1 = w._sha1s[i]

527

print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),

528

for pv in w._v[i]:

529

print pv,

530

531

total += bytes

532

533

print >>out, "versions total %d bytes" % total

534

print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))

714

535

715

536

716

537

def usage():

814

635

lasto = origin

815

636

816

637

elif cmd == 'info':

817

weave_info(readit())

818

819

elif cmd == 'stats':

820

weave_stats(argv[2])

638

weave_info(argv[2], sys.stdout)

821

639

822

640

elif cmd == 'check':

823

641

w = readit()

824

642

pb = ProgressBar()

825

643

w.check(pb)

826

644

pb.clear()

827

print '%d versions ok' % w.numversions()

828

645

829

646

elif cmd == 'inclusions':

830

647

w = readit()

832

649

833

650

elif cmd == 'parents':

834

651

w = readit()

835

print ' '.join(map(str, w._parents[int(argv[3])]))

836

837

elif cmd == 'plan-merge':

838

w = readit()

839

for state, line in w.plan_merge(int(argv[3]), int(argv[4])):

840

if line:

841

print '%14s | %s' % (state, line),

652

print ' '.join(map(str, w._v[int(argv[3])]))

842

653

843

654

elif cmd == 'merge':

844

w = readit()

845

p = w.plan_merge(int(argv[3]), int(argv[4]))

846

sys.stdout.writelines(w.weave_merge(p))

847

848

elif cmd == 'mash-merge':

849

655

if len(argv) != 5:

850

656

usage()

851

657

return 1

Older »