~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: Martin Pool
Date: 2005-07-07 10:22:02 UTC
Revision ID: mbp@sourcefrog.net-20050707102201-2d2a13a25098b101

- rearrange and clear up merged weave

files added:
bzrlib/meta_store.py

bzrlib/remotebranch.py

bzrlib/statcache.py

bzrlib/upgrade.py

patches/annotate3.patch

patches/annotate4.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

plugins/changeset

plugins/changeset/__init__.py

plugins/changeset/apply_changeset.py

plugins/changeset/common.py

plugins/changeset/gen_changeset.py

plugins/changeset/read_changeset.py

plugins/checkperms

testsweet.py

files removed:
HACKING

INSTALL

Makefile

NEWS.developers

bzrlib/annotate.py

bzrlib/builtins.py

bzrlib/clone.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/delta.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/identitymap.py

bzrlib/intset.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/option.py

bzrlib/plugins/__init__.py

bzrlib/revisionspec.py

bzrlib/selftest/HTTPTestUtil.py

bzrlib/selftest/TestUtil.py

bzrlib/selftest/stub_sftp.py

bzrlib/selftest/test_ancestry.py

bzrlib/selftest/test_bad_files.py

bzrlib/selftest/test_command.py

bzrlib/selftest/test_commit.py

bzrlib/selftest/test_commit_merge.py

bzrlib/selftest/test_conflicts.py

bzrlib/selftest/test_merge_core.py

bzrlib/selftest/test_parent.py

bzrlib/selftest/test_revision_info.py

bzrlib/selftest/test_smart_add.py

bzrlib/selftest/test_upgrade.py

bzrlib/selftest/test_xml.py

bzrlib/selftest/testannotate.py

bzrlib/selftest/testapi.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testconfig.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testfetch.py

bzrlib/selftest/testgpg.py

bzrlib/selftest/testgraph.py

bzrlib/selftest/testhttp.py

bzrlib/selftest/testidentitymap.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge.py

bzrlib/selftest/testnonascii.py

bzrlib/selftest/testoptions.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/testrevprops.py

bzrlib/selftest/testreweave.py

bzrlib/selftest/testsampler.py

bzrlib/selftest/testsftp.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/teststore.py

bzrlib/selftest/testtestament.py

bzrlib/selftest/testtransactions.py

bzrlib/selftest/testtransport.py

bzrlib/selftest/testtsort.py

bzrlib/selftest/testworkingtree.py

bzrlib/selftest/treeshape.py

bzrlib/shellcomplete.py

bzrlib/store

bzrlib/store/compressed_text.py

bzrlib/store/text.py

bzrlib/store/weave.py

bzrlib/testament.py

bzrlib/transactions.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/http.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/sftp.py

bzrlib/tsort.py

bzrlib/ui.py

bzrlib/upgrade.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/util/configobj/validate.py

bzrlib/win32console.py

bzrlib/xml4.py

bzrlib/xml5.py

contrib/emacs

contrib/emacs/bzr-mode.el

doc/split-join-files.txt

notes/inventory-v2-sample.xml

notes/inventory-v2.rnc

notes/revfile.txt

notes/schemas.xml

patches/cache_weave_inclusions.diff

tools/capture_tree.py

tools/history2revfiles.py

tools/http_client.py

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files renamed:
contrib/newinventory.py => bzrlib/newinventory.py

bzrlib/selftest/testplugins.py => bzrlib/selftest/plugins.py

bzrlib/store/__init__.py => bzrlib/store.py

bzrlib/util/effbot/ => effbot/

bzrlib/util/elementtree/ => elementtree/

bzrlib/plugins/ => plugins/

bzrlib/selftest/test_weave.py => tools/testweave.py

bzrlib/util/urlgrabber/ => urlgrabber/

files modified:
.bzrignore

.rsyncexclude

NEWS

README

TODO

bzr *

bzr-man.py

bzrlib/__init__.py

bzrlib/add.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/changeset.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/mdiff.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/status.py

bzrlib/textinv.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/weave.py *

bzrlib/weavefile.py

bzrlib/workingtree.py

bzrlib/xml.py

contrib/pwk

contrib/zsh/_bzr

doc/index.txt

doc/todo-from-arch.txt

setup.py *

tools/convertfile.py

tools/convertinv.py

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

# TODO: Shouldn't need to lock if we always write in append mode and

# then ftell after writing to see where it went. In any case we

# assume the whole branch is protected by a lock.

import os

import sha

import stat

import struct

100

import sys

101

import zlib

import sys, zlib, struct, mdiff, stat, os, sha

102

from binascii import hexlify, unhexlify

103

104

import bzrlib.mdiff as mdiff

105

factor = 10

106

107

_RECORDSIZE = 48

108

120

FL_GZIP = 1

121

122

# maximum number of patches in a row before recording a whole text.

123

CHAIN_LIMIT = 10

CHAIN_LIMIT = 50

124

100

125

101

126

102

class RevfileError(Exception):

157

133

self.idxfile = open(idxname, 'w+b')

158

134

self.datafile = open(dataname, 'w+b')

159

135

136

print 'init empty file'

160

137

self.idxfile.write(_HEADER)

161

138

self.idxfile.flush()

162

139

else:

250

227

return self._add_compressed(text_sha, text, _NO_RECORD, compress)

251

228

252

229

253

# NOT USED

254

def _choose_base(self, seed, base):

255

while seed & 3 == 3:

256

if base == _NO_RECORD:

257

return _NO_RECORD

258

idxrec = self[base]

259

if idxrec[I_BASE] == _NO_RECORD:

260

return base

261

262

base = idxrec[I_BASE]

263

seed >>= 2

264

265

return base # relative to this full text

266

267

268

269

230

def _add_delta(self, text, text_sha, base, compress):

270

231

"""Add a text stored relative to a previous text."""

271

232

self._check_index(base)

272

233

273

234

try:

274

base_text = self.get(base, CHAIN_LIMIT)

235

base_text = self.get(base, recursion_limit=CHAIN_LIMIT)

275

236

except LimitHitException:

276

237

return self._add_full_text(text, text_sha, compress)

277

238

278

239

data = mdiff.bdiff(base_text, text)

279

280

281

if True: # paranoid early check for bad diff

282

result = mdiff.bpatch(base_text, data)

283

assert result == text

284

285

240

286

241

# If the delta is larger than the text, we might as well just

287

242

# store the text. (OK, the delta might be more compressible,

293

248

return self._add_compressed(text_sha, data, base, compress)

294

249

295

250

296

def add(self, text, base=None, compress=True):

251

def add(self, text, base=_NO_RECORD, compress=True):

297

252

"""Add a new text to the revfile.

298

253

299

254

If the text is already present them its existing id is

307

262

only be used if it would be a size win and if the existing

308

263

base is not at too long of a delta chain already.

309

264

"""

310

if base == None:

311

base = _NO_RECORD

312

313

265

self._check_write()

314

266

315

267

text_sha = sha.new(text).digest()

320

272

# it's the same, in case someone ever breaks SHA-1.

321

273

return idx # already present

322

274

323

# base = self._choose_base(ord(text_sha[0]), base)

324

325

275

if base == _NO_RECORD:

326

276

return self._add_full_text(text, text_sha, compress)

327

277

else:

346

296

text = self._get_patched(idx, idxrec, recursion_limit)

347

297

348

298

if sha.new(text).digest() != idxrec[I_SHA]:

349

raise RevfileError("corrupt SHA-1 digest on record %d in %s"

350

% (idx, self.basename))

299

raise RevfileError("corrupt SHA-1 digest on record %d"

300

% idx)

351

301

352

302

return text

353

303

422

372

self._seek_index(idx)

423

373

idxrec = self._read_next_index()

424

374

if idxrec == None:

425

raise IndexError("no index %d" % idx)

375

raise IndexError()

426

376

else:

427

377

return idxrec

428

378

438

388

"""Read back all index records.

439

389

440

390

Do not seek the index file while this is underway!"""

441

## sys.stderr.write(" ** iter called ** \n")

391

sys.stderr.write(" ** iter called ** \n")

442

392

self._seek_index(0)

443

393

while True:

444

394

idxrec = self._read_next_index()

486

436

for idx in range(len(self)):

487

437

t += len(self.get(idx))

488

438

return t

489

490

491

def check(self, pb=None):

492

"""Extract every version and check its hash."""

493

total = len(self)

494

for i in range(total):

495

if pb:

496

pb.update("check revision", i, total)

497

# the get method implicitly checks the SHA-1

498

self.get(i)

499

if pb:

500

pb.clear()

501

439

502

440

503

441

504

442

def main(argv):

505

443

try:

506

444

cmd = argv[1]

507

filename = argv[2]

508

445

except IndexError:

509

sys.stderr.write("usage: revfile dump REVFILE\n"

510

" revfile add REVFILE < INPUT\n"

511

" revfile add-delta REVFILE BASE < INPUT\n"

512

" revfile add-series REVFILE BASE FILE...\n"

513

" revfile get REVFILE IDX\n"

514

" revfile find-sha REVFILE HEX\n"

515

" revfile total-text-size REVFILE\n"

516

" revfile last REVFILE\n")

446

sys.stderr.write("usage: revfile dump\n"

447

" revfile add\n"

448

" revfile add-delta BASE\n"

449

" revfile get IDX\n"

450

" revfile find-sha HEX\n"

451

" revfile total-text-size\n"

452

" revfile last\n")

517

453

return 1

518

454

519

if filename.endswith('.drev') or filename.endswith('.irev'):

520

filename = filename[:-5]

521

522

455

def rw():

523

return Revfile(filename, 'w')

456

return Revfile('testrev', 'w')

524

457

525

458

def ro():

526

return Revfile(filename, 'r')

459

return Revfile('testrev', 'r')

527

460

528

461

if cmd == 'add':

529

462

print rw().add(sys.stdin.read())

530

463

elif cmd == 'add-delta':

531

print rw().add(sys.stdin.read(), int(argv[3]))

532

elif cmd == 'add-series':

533

r = rw()

534

rev = int(argv[3])

535

for fn in argv[4:]:

536

print rev

537

rev = r.add(file(fn).read(), rev)

464

print rw().add(sys.stdin.read(), int(argv[2]))

538

465

elif cmd == 'dump':

539

466

ro().dump()

540

467

elif cmd == 'get':

541

468

try:

542

idx = int(argv[3])

469

idx = int(argv[2])

543

470

except IndexError:

544

sys.stderr.write("usage: revfile get FILE IDX\n")

471

sys.stderr.write("usage: revfile get IDX\n")

545

472

return 1

546

473

547

r = ro()

548

549

474

if idx < 0 or idx >= len(r):

550

475

sys.stderr.write("invalid index %r\n" % idx)

551

476

return 1

552

477

553

sys.stdout.write(r.get(idx))

478

sys.stdout.write(ro().get(idx))

554

479

elif cmd == 'find-sha':

555

480

try:

556

s = unhexlify(argv[3])

481

s = unhexlify(argv[2])

557

482

except IndexError:

558

sys.stderr.write("usage: revfile find-sha FILE HEX\n")

483

sys.stderr.write("usage: revfile find-sha HEX\n")

559

484

return 1

560

485

561

486

idx = ro().find_sha(s)

568

493

print ro().total_text_size()

569

494

elif cmd == 'last':

570

495

print len(ro())-1

571

elif cmd == 'check':

572

import bzrlib.progress

573

pb = bzrlib.progress.ProgressBar()

574

ro().check(pb)

575

496

else:

576

497

sys.stderr.write("unknown command %r\n" % cmd)

577

498

return 1

Older »