~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: Martin Pool
Date: 2005-06-06 04:47:33 UTC
Revision ID: mbp@sourcefrog.net-20050606044733-e902b05ac1747cd2

- fix invocation of testbzr when giving explicit bzr location

files added:
bzrlib/statcache.py

files removed:
HACKING

bzr-man.py

bzrlib/delta.py

bzrlib/hashcache.py

bzrlib/intset.py

bzrlib/merge3.py

bzrlib/meta_store.py

bzrlib/missing.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/selftest

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/versioning.py

bzrlib/upgrade.py

bzrlib/weave.py

bzrlib/weavefile.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/pwclient.full

contrib/pwk

doc/split-join-files.txt

effbot

effbot/__init__.py

effbot/org

effbot/org/__init__.py

effbot/org/gzip_consumer.py

effbot/org/http_client.py

effbot/org/http_manager.py

notes/revfile.txt

patches/annotate3.patch

patches/annotate4.patch

patches/cache-remote-revisions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/ndiff.patch

patches/pending-merge.patch

patches/plugins-no-plugins.patch

patches/progress.diff

plugins

plugins/changeset

plugins/changeset/__init__.py

plugins/changeset/apply_changeset.py

plugins/changeset/common.py

plugins/changeset/gen_changeset.py

plugins/changeset/read_changeset.py

plugins/checkperms

testsweet.py

tools

tools/convertfile.py

tools/convertinv.py

tools/testweave.py

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files renamed:
bzrlib/selftest/__init__.py => bzrlib/selftest.py

bzrlib/selftest/whitebox.py => bzrlib/whitebox.py

files modified:
.bzrignore

NEWS

README

TODO

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/changeset.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/help.py

bzrlib/inventory.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/mdiff.py

bzrlib/merge.py

bzrlib/merge_core.py

bzrlib/newinventory.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/status.py

bzrlib/store.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/workingtree.py

bzrlib/xml.py

contrib/add-bzr-to-baz

contrib/create_bzr_rollup.py

contrib/upload-bzr.dev

doc/formats.txt

testbzr

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA-1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

factor = 10

_RECORDSIZE = 48

_HEADER = "bzr revfile v1\n"

110

FL_GZIP = 1

111

112

# maximum number of patches in a row before recording a whole text.

113

CHAIN_LIMIT = 25

CHAIN_LIMIT = 50

114

100

115

101

116

102

class RevfileError(Exception):

241

227

return self._add_compressed(text_sha, text, _NO_RECORD, compress)

242

228

243

229

244

# NOT USED

245

def _choose_base(self, seed, base):

246

while seed & 3 == 3:

247

if base == _NO_RECORD:

248

return _NO_RECORD

249

idxrec = self[base]

250

if idxrec[I_BASE] == _NO_RECORD:

251

return base

252

253

base = idxrec[I_BASE]

254

seed >>= 2

255

256

return base # relative to this full text

257

258

259

260

230

def _add_delta(self, text, text_sha, base, compress):

261

231

"""Add a text stored relative to a previous text."""

262

232

self._check_index(base)

263

233

264

234

try:

265

base_text = self.get(base, CHAIN_LIMIT)

235

base_text = self.get(base, recursion_limit=CHAIN_LIMIT)

266

236

except LimitHitException:

267

237

return self._add_full_text(text, text_sha, compress)

268

238

302

272

# it's the same, in case someone ever breaks SHA-1.

303

273

return idx # already present

304

274

305

# base = self._choose_base(ord(text_sha[0]), base)

306

307

275

if base == _NO_RECORD:

308

276

return self._add_full_text(text, text_sha, compress)

309

277

else:

404

372

self._seek_index(idx)

405

373

idxrec = self._read_next_index()

406

374

if idxrec == None:

407

raise IndexError("no index %d" % idx)

375

raise IndexError()

408

376

else:

409

377

return idxrec

410

378

420

388

"""Read back all index records.

421

389

422

390

Do not seek the index file while this is underway!"""

423

## sys.stderr.write(" ** iter called ** \n")

391

sys.stderr.write(" ** iter called ** \n")

424

392

self._seek_index(0)

425

393

while True:

426

394

idxrec = self._read_next_index()

474

442

def main(argv):

475

443

try:

476

444

cmd = argv[1]

477

filename = argv[2]

478

445

except IndexError:

479

sys.stderr.write("usage: revfile dump REVFILE\n"

480

" revfile add REVFILE < INPUT\n"

481

" revfile add-delta REVFILE BASE < INPUT\n"

482

" revfile add-series REVFILE BASE FILE...\n"

483

" revfile get REVFILE IDX\n"

484

" revfile find-sha REVFILE HEX\n"

485

" revfile total-text-size REVFILE\n"

486

" revfile last REVFILE\n")

446

sys.stderr.write("usage: revfile dump\n"

447

" revfile add\n"

448

" revfile add-delta BASE\n"

449

" revfile get IDX\n"

450

" revfile find-sha HEX\n"

451

" revfile total-text-size\n"

452

" revfile last\n")

487

453

return 1

488

454

489

455

def rw():

490

return Revfile(filename, 'w')

456

return Revfile('testrev', 'w')

491

457

492

458

def ro():

493

return Revfile(filename, 'r')

459

return Revfile('testrev', 'r')

494

460

495

461

if cmd == 'add':

496

462

print rw().add(sys.stdin.read())

497

463

elif cmd == 'add-delta':

498

print rw().add(sys.stdin.read(), int(argv[3]))

499

elif cmd == 'add-series':

500

r = rw()

501

rev = int(argv[3])

502

for fn in argv[4:]:

503

print rev

504

rev = r.add(file(fn).read(), rev)

464

print rw().add(sys.stdin.read(), int(argv[2]))

505

465

elif cmd == 'dump':

506

466

ro().dump()

507

467

elif cmd == 'get':

508

468

try:

509

idx = int(argv[3])

469

idx = int(argv[2])

510

470

except IndexError:

511

sys.stderr.write("usage: revfile get FILE IDX\n")

471

sys.stderr.write("usage: revfile get IDX\n")

512

472

return 1

513

473

514

r = ro()

515

516

474

if idx < 0 or idx >= len(r):

517

475

sys.stderr.write("invalid index %r\n" % idx)

518

476

return 1

519

477

520

sys.stdout.write(r.get(idx))

478

sys.stdout.write(ro().get(idx))

521

479

elif cmd == 'find-sha':

522

480

try:

523

s = unhexlify(argv[3])

481

s = unhexlify(argv[2])

524

482

except IndexError:

525

sys.stderr.write("usage: revfile find-sha FILE HEX\n")

483

sys.stderr.write("usage: revfile find-sha HEX\n")

526

484

return 1

527

485

528

486

idx = ro().find_sha(s)

Older »