~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/weave.py

Committer: Martin Pool
Date: 2005-07-13 00:30:30 UTC
Revision ID: mbp@sourcefrog.net-20050713003030-2e89871a9ce24c7b

- typo in testsweet

files removed:
bzrlib/intset.py

bzrlib/selftest/testinv.py

tools/weavemerge.sh

files modified:
NEWS

README

TODO

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/changeset.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/diff.py

bzrlib/hashcache.py

bzrlib/inventory.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/merge_core.py

bzrlib/osutils.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/versioning.py

bzrlib/trace.py

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/workingtree.py

testsweet.py

tools/convertfile.py

tools/testweave.py

tools/weavebench.py

Show diffs side-by-side

added added

removed removed

bzrlib/weave.py

"""Weave - storage of related text file versions"""

# before intset (r923) 2000 versions in 41.5s

# with intset (r926) 2000 versions in 93s !!!

# better to just use plain sets.

# making _extract build and return a list, rather than being a generator

# takes 37.94s

# with python -O, r923 does 2000 versions in 36.87s

# with optimizations to avoid mutating lists - 35.75! I guess copying

# all the elements every time costs more than the small manipulations.

# a surprisingly small change.

# r931, which avoids using a generator for extract, does 36.98s

# with memoized inclusions, takes 41.49s; not very good

# with slots, takes 37.35s; without takes 39.16, a bit surprising

# with the delta calculation mixed in with the add method, rather than

# separated, takes 36.78s

# with delta folded in and mutation of the list, 36.13s

# with all this and simplification of add code, 33s

# TODO: Perhaps have copy method for Weave instances?

# XXX: If we do weaves this way, will a merge still behave the same

# properly nested, that there is no text outside of an insertion, that

# insertions or deletions are not repeated, etc.

# TODO: Parallel-extract that passes back each line along with a

# description of which revisions include it. Nice for checking all

# shas in parallel.

try:

set

frozenset

except NameError:

from sets import Set, ImmutableSet

set = Set

frozenset = ImmutableSet

del Set, ImmutableSet

class WeaveError(Exception):

107

the version-id is used to reference it in the larger world.

108

109

The weave is represented as a list mixing edit instructions and

110

literal text. Each entry in _weave can be either a string (or

literal text. Each entry in _l can be either a string (or

111

unicode), or a tuple. If a string, it means that the given line

112

should be output in the currently active revisions.

113

151

129

should be no way to get an earlier version deleting a later

152

130

version.

153

131

154

_weave

155

Text of the weave; list of control instruction tuples and strings.

132

133

Text of the weave.

156

134

157

_parents

135

158

136

List of parents, indexed by version number.

159

137

It is only necessary to store the minimal set of parents for

160

138

each version; the parent's parents are implied.

162

140

_sha1s

163

141

List of hex SHA-1 of each version, or None if not recorded.

164

142

"""

165

166

__slots__ = ['_weave', '_parents', '_sha1s']

167

168

143

def __init__(self):

169

self._weave = []

170

self._parents = []

144

self._l = []

145

self._v = []

171

146

self._sha1s = []

172

147

173

148

174

149

def __eq__(self, other):

175

150

if not isinstance(other, Weave):

176

151

return False

177

return self._parents == other._parents \

178

and self._weave == other._weave

152

return self._v == other._v \

153

and self._l == other._l

179

154

180

155

181

156

def __ne__(self, other):

192

167

193

168

text

194

169

Sequence of lines to be added in the new version."""

195

196

self._check_versions(parents)

170

## self._check_versions(parents)

197

171

## self._check_lines(text)

198

new_version = len(self._parents)

172

idx = len(self._v)

199

173

200

174

import sha

201

175

s = sha.new()

202

map(s.update, text)

176

for l in text:

177

s.update(l)

203

178

sha1 = s.hexdigest()

204

179

del s

205

180

206

# if we abort after here the weave will be corrupt

207

self._parents.append(frozenset(parents))

181

if parents:

182

ancestors = self.inclusions(parents)

183

delta = self._delta(ancestors, text)

184

185

# offset gives the number of lines that have been inserted

186

# into the weave up to the current point; if the original edit instruction

187

# says to change line A then we actually change (A+offset)

188

offset = 0

189

190

for i1, i2, newlines in delta:

191

assert 0 <= i1

192

assert i1 <= i2

193

assert i2 <= len(self._l)

194

195

# the deletion and insertion are handled separately.

196

# first delete the region.

197

if i1 != i2:

198

self._l.insert(i1+offset, ('[', idx))

199

self._l.insert(i2+offset+1, (']', idx))

200

offset += 2

201

# is this OK???

202

203

if newlines:

204

# there may have been a deletion spanning up to

205

# i2; we want to insert after this region to make sure

206

# we don't destroy ourselves

207

i = i2 + offset

208

self._l[i:i] = [('{', idx)] \

209

+ newlines \

210

+ [('}', idx)]

211

offset += 2 + len(newlines)

212

213

self._addversion(parents)

214

else:

215

# special case; adding a revision with no parents; can do this

216

# more quickly by just appending unconditionally

217

self._l.append(('{', idx))

218

self._l += text

219

self._l.append(('}', idx))

220

221

self._addversion(None)

222

208

223

self._sha1s.append(sha1)

209

210

211

if not parents:

212

# special case; adding a revision with no parents; can do

213

# this more quickly by just appending unconditionally.

214

# even more specially, if we're adding an empty text we

215

# need do nothing at all.

216

if text:

217

self._weave.append(('{', new_version))

218

self._weave.extend(text)

219

self._weave.append(('}', new_version))

220

221

return new_version

222

223

if len(parents) == 1:

224

pv = list(parents)[0]

225

if sha1 == self._sha1s[pv]:

226

# special case: same as the single parent

227

return new_version

228

229

230

ancestors = self.inclusions(parents)

231

232

l = self._weave

233

234

# basis is a list of (origin, lineno, line)

235

basis_lineno = []

236

basis_lines = []

237

for origin, lineno, line in self._extract(ancestors):

238

basis_lineno.append(lineno)

239

basis_lines.append(line)

240

241

# another small special case: a merge, producing the same text as auto-merge

242

if text == basis_lines:

243

return new_version

244

245

# add a sentinel, because we can also match against the final line

246

basis_lineno.append(len(self._weave))

247

248

# XXX: which line of the weave should we really consider

249

# matches the end of the file? the current code says it's the

250

# last line of the weave?

251

252

#print 'basis_lines:', basis_lines

253

#print 'new_lines: ', lines

254

255

from difflib import SequenceMatcher

256

s = SequenceMatcher(None, basis_lines, text)

257

258

# offset gives the number of lines that have been inserted

259

# into the weave up to the current point; if the original edit instruction

260

# says to change line A then we actually change (A+offset)

261

offset = 0

262

263

for tag, i1, i2, j1, j2 in s.get_opcodes():

264

# i1,i2 are given in offsets within basis_lines; we need to map them

265

# back to offsets within the entire weave

266

#print 'raw match', tag, i1, i2, j1, j2

267

if tag == 'equal':

268

continue

269

270

i1 = basis_lineno[i1]

271

i2 = basis_lineno[i2]

272

273

assert 0 <= j1 <= j2 <= len(text)

274

275

#print tag, i1, i2, j1, j2

276

277

# the deletion and insertion are handled separately.

278

# first delete the region.

279

if i1 != i2:

280

self._weave.insert(i1+offset, ('[', new_version))

281

self._weave.insert(i2+offset+1, (']', new_version))

282

offset += 2

283

284

if j1 != j2:

285

# there may have been a deletion spanning up to

286

# i2; we want to insert after this region to make sure

287

# we don't destroy ourselves

288

i = i2 + offset

289

self._weave[i:i] = ([('{', new_version)]

290

+ text[j1:j2]

291

+ [('}', new_version)])

292

offset += 2 + (j2 - j1)

293

294

return new_version

224

225

return idx

295

226

296

227

297

228

def inclusions(self, versions):

302

233

while v >= 0:

303

234

if v in i:

304

235

# include all its parents

305

i.update(self._parents[v])

236

i.update(self._v[v])

306

237

v -= 1

307

238

return i

308

239

except IndexError:

311

242

312

243

def minimal_parents(self, version):

313

244

"""Find the minimal set of parents for the version."""

314

included = self._parents[version]

245

included = self._v[version]

315

246

if not included:

316

247

return []

317

248

331

262

return mininc

332

263

333

264

265

def _addversion(self, parents):

266

if parents:

267

self._v.append(parents)

268

else:

269

self._v.append(frozenset())

270

334

271

335

272

def _check_lines(self, text):

336

273

if not isinstance(text, list):

347

284

"""Check everything in the sequence of indexes is valid"""

348

285

for i in indexes:

349

286

try:

350

self._parents[i]

287

self._v[i]

351

288

except IndexError:

352

289

raise IndexError("invalid version number %r" % i)

353

290

364

301

yield origin, text

365

302

366

303

367

def _walk(self):

368

"""Walk the weave.

369

370

Yields sequence of

371

(lineno, insert, deletes, text)

372

for each literal line.

373

"""

374

375

istack = []

376

dset = set()

377

378

lineno = 0 # line of weave, 0-based

379

380

for l in self._weave:

381

if isinstance(l, tuple):

382

c, v = l

383

isactive = None

384

if c == '{':

385

istack.append(v)

386

elif c == '}':

387

oldv = istack.pop()

388

elif c == '[':

389

assert v not in dset

390

dset.add(v)

391

elif c == ']':

392

dset.remove(v)

393

else:

394

raise WeaveFormatError('unexpected instruction %r'

395

% v)

396

else:

397

assert isinstance(l, basestring)

398

assert istack

399

yield lineno, istack[-1], dset, l

400

lineno += 1

401

402

403

404

304

def _extract(self, versions):

405

305

"""Yield annotation of lines in included set.

406

306

419

319

420

320

isactive = None

421

321

422

result = []

423

424

322

WFE = WeaveFormatError

425

323

426

for l in self._weave:

324

for l in self._l:

427

325

if isinstance(l, tuple):

428

326

c, v = l

429

327

isactive = None

447

345

if isactive is None:

448

346

isactive = (not dset) and istack and (istack[-1] in included)

449

347

if isactive:

450

result.append((istack[-1], lineno, l))

348

yield istack[-1], lineno, l

451

349

lineno += 1

452

350

453

351

if istack:

457

355

raise WFE("unclosed deletion blocks at end of weave",

458

356

dset)

459

357

460

return result

461

462

463

358

464

359

def get_iter(self, version):

465

360

"""Yield lines for the specified version."""

473

368

474

369

def mash_iter(self, included):

475

370

"""Return composed version of multiple included versions."""

371

included = frozenset(included)

476

372

for origin, lineno, text in self._extract(included):

477

373

yield text

478

374

479

375

480

376

def dump(self, to_file):

481

377

from pprint import pprint

482

print >>to_file, "Weave._weave = ",

483

pprint(self._weave, to_file)

484

print >>to_file, "Weave._parents = ",

485

pprint(self._parents, to_file)

378

print >>to_file, "Weave._l = ",

379

pprint(self._l, to_file)

380

print >>to_file, "Weave._v = ",

381

pprint(self._v, to_file)

486

382

487

383

488

384

489

385

def numversions(self):

490

l = len(self._parents)

386

l = len(self._v)

491

387

assert l == len(self._sha1s)

492

388

return l

493

389

494

390

495

def __len__(self):

496

return self.numversions()

497

498

499

391

def check(self, progress_bar=None):

500

392

# check no circular inclusions

501

393

for version in range(self.numversions()):

502

inclusions = list(self._parents[version])

394

inclusions = list(self._v[version])

503

395

if inclusions:

504

396

inclusions.sort()

505

397

if inclusions[-1] >= version:

565

457

If line1=line2, this is a pure insert; if newlines=[] this is a

566

458

pure delete. (Similar to difflib.)

567

459

"""

568

569

570

571

def plan_merge(self, ver_a, ver_b):

572

"""Return pseudo-annotation indicating how the two versions merge.

573

574

This is computed between versions a and b and their common

575

base.

576

577

Weave lines present in none of them are skipped entirely.

578

"""

579

inc_a = self.inclusions([ver_a])

580

inc_b = self.inclusions([ver_b])

581

inc_c = inc_a & inc_b

582

583

for lineno, insert, deleteset, line in self._walk():

584

if deleteset & inc_c:

585

# killed in parent; can't be in either a or b

586

# not relevant to our work

587

yield 'killed-base', line

588

elif insert in inc_c:

589

# was inserted in base

590

killed_a = bool(deleteset & inc_a)

591

killed_b = bool(deleteset & inc_b)

592

if killed_a and killed_b:

593

yield 'killed-both', line

594

elif killed_a:

595

yield 'killed-a', line

596

elif killed_b:

597

yield 'killed-b', line

598

else:

599

yield 'unchanged', line

600

elif insert in inc_a:

601

if deleteset & inc_a:

602

yield 'ghost-a', line

603

else:

604

# new in A; not in B

605

yield 'new-a', line

606

elif insert in inc_b:

607

if deleteset & inc_b:

608

yield 'ghost-b', line

609

else:

610

yield 'new-b', line

611

else:

612

# not in either revision

613

yield 'irrelevant', line

614

615

yield 'unchanged', '' # terminator

616

617

618

619

def weave_merge(self, plan):

620

lines_a = []

621

lines_b = []

622

ch_a = ch_b = False

623

624

for state, line in plan:

625

if state == 'unchanged' or state == 'killed-both':

626

# resync and flush queued conflicting changes, if any

627

if not lines_a and not lines_b:

628

pass

629

elif ch_a and not ch_b:

630

# one-sided change:

631

for l in lines_a: yield l

632

elif ch_b and not ch_a:

633

for l in lines_b: yield l

634

elif lines_a == lines_b:

635

for l in lines_a: yield l

636

else:

637

yield '<<<<\n'

638

for l in lines_a: yield l

639

yield '====\n'

640

for l in lines_b: yield l

641

yield '>>>>\n'

642

643

del lines_a[:]

644

del lines_b[:]

645

ch_a = ch_b = False

646

647

if state == 'unchanged':

648

if line:

649

yield line

650

elif state == 'killed-a':

651

ch_a = True

652

lines_b.append(line)

653

elif state == 'killed-b':

654

ch_b = True

655

lines_a.append(line)

656

elif state == 'new-a':

657

ch_a = True

658

lines_a.append(line)

659

elif state == 'new-b':

660

ch_b = True

661

lines_b.append(line)

662

else:

663

assert state in ('irrelevant', 'ghost-a', 'ghost-b', 'killed-base',

664

'killed-both'), \

665

state

666

667

668

669

670

671

672

673

def weave_info(w):

460

# basis is a list of (origin, lineno, line)

461

basis_lineno = []

462

basis_lines = []

463

for origin, lineno, line in self._extract(included):

464

basis_lineno.append(lineno)

465

basis_lines.append(line)

466

467

# add a sentinel, because we can also match against the final line

468

basis_lineno.append(len(self._l))

469

470

# XXX: which line of the weave should we really consider

471

# matches the end of the file? the current code says it's the

472

# last line of the weave?

473

474

from difflib import SequenceMatcher

475

s = SequenceMatcher(None, basis_lines, lines)

476

477

# TODO: Perhaps return line numbers from composed weave as well?

478

479

for tag, i1, i2, j1, j2 in s.get_opcodes():

480

##print tag, i1, i2, j1, j2

481

482

if tag == 'equal':

483

continue

484

485

# i1,i2 are given in offsets within basis_lines; we need to map them

486

# back to offsets within the entire weave

487

real_i1 = basis_lineno[i1]

488

real_i2 = basis_lineno[i2]

489

490

assert 0 <= j1

491

assert j1 <= j2

492

assert j2 <= len(lines)

493

494

yield real_i1, real_i2, lines[j1:j2]

495

496

497

498

def weave_info(filename, out):

674

499

"""Show some text information about the weave."""

675

print '%6s %40s %20s' % ('ver', 'sha1', 'parents')

676

for i in (6, 40, 20):

677

print '-' * i,

678

679

for i in range(w.numversions()):

680

sha1 = w._sha1s[i]

681

print '%6d %40s %s' % (i, sha1, ' '.join(map(str, w._parents[i])))

682

683

684

685

def weave_stats(weave_file):

686

from bzrlib.progress import ProgressBar

687

from bzrlib.weavefile import read_weave

688

689

pb = ProgressBar()

690

691

wf = file(weave_file, 'rb')

500

from weavefile import read_weave

501

wf = file(filename, 'rb')

692

502

w = read_weave(wf)

693

503

# FIXME: doesn't work on pipes

694

504

weave_size = wf.tell()

505

print >>out, "weave file size %d bytes" % weave_size

506

print >>out, "weave contains %d versions" % len(w._v)

695

507

696

508

total = 0

697

vers = len(w)

698

for i in range(vers):

699

pb.update('checking sizes', i, vers)

700

for line in w.get_iter(i):

701

total += len(line)

702

703

pb.clear()

704

705

print 'versions %9d' % vers

706

print 'weave file %9d bytes' % weave_size

707

print 'total contents %9d bytes' % total

708

print 'compression ratio %9.2fx' % (float(total) / float(weave_size))

709

509

print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')

510

for i in (6, 6, 8, 40, 20):

511

print '-' * i,

512

513

for i in range(len(w._v)):

514

text = w.get(i)

515

lines = len(text)

516

bytes = sum((len(a) for a in text))

517

sha1 = w._sha1s[i]

518

print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),

519

for pv in w._v[i]:

520

print pv,

521

522

total += bytes

523

524

print >>out, "versions total %d bytes" % total

525

print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))

710

526

711

527

712

528

def usage():

810

626

lasto = origin

811

627

812

628

elif cmd == 'info':

813

weave_info(readit())

814

815

elif cmd == 'stats':

816

weave_stats(argv[2])

629

weave_info(argv[2], sys.stdout)

817

630

818

631

elif cmd == 'check':

819

632

w = readit()

820

633

pb = ProgressBar()

821

634

w.check(pb)

822

635

pb.clear()

823

print '%d versions ok' % w.numversions()

824

636

825

637

elif cmd == 'inclusions':

826

638

w = readit()

828

640

829

641

elif cmd == 'parents':

830

642

w = readit()

831

print ' '.join(map(str, w._parents[int(argv[3])]))

832

833

elif cmd == 'plan-merge':

834

w = readit()

835

for state, line in w.plan_merge(int(argv[3]), int(argv[4])):

836

if line:

837

print '%14s | %s' % (state, line),

643

print ' '.join(map(str, w._v[int(argv[3])]))

838

644

839

645

elif cmd == 'merge':

840

w = readit()

841

p = w.plan_merge(int(argv[3]), int(argv[4]))

842

sys.stdout.writelines(w.weave_merge(p))

843

844

elif cmd == 'mash-merge':

845

646

if len(argv) != 5:

846

647

usage()

847

648

return 1

Older »