~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: Robert Collins
Date: 2005-10-06 12:14:01 UTC
mfrom: (1393.1.67)
Revision ID: robertc@robertcollins.net-20051006121401-ce87bcb93909bbdf

merge martins latest

files added:
HACKING

Makefile

NEWS.developers

bzr-man.py

bzrlib/annotate.py

bzrlib/builtins.py

bzrlib/clone.py

bzrlib/delta.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/graph.py

bzrlib/hashcache.py

bzrlib/intset.py

bzrlib/lock.py

bzrlib/merge3.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/progress.py

bzrlib/revisionspec.py

bzrlib/selftest

bzrlib/selftest/HTTPTestUtil.py

bzrlib/selftest/TestUtil.py

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/test_ancestry.py

bzrlib/selftest/test_bad_files.py

bzrlib/selftest/test_commit.py

bzrlib/selftest/test_commit_merge.py

bzrlib/selftest/test_merge_core.py

bzrlib/selftest/test_parent.py

bzrlib/selftest/test_revision_info.py

bzrlib/selftest/test_smart_add.py

bzrlib/selftest/test_upgrade.py

bzrlib/selftest/test_weave.py

bzrlib/selftest/test_xml.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testfetch.py

bzrlib/selftest/testgraph.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/teststore.py

bzrlib/selftest/testtransport.py

bzrlib/selftest/testworkingtree.py

bzrlib/selftest/treeshape.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/shellcomplete.py

bzrlib/store

bzrlib/store/compressed_text.py

bzrlib/store/text.py

bzrlib/store/weave.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/http.py

bzrlib/transport/local.py

bzrlib/ui.py

bzrlib/upgrade.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/xml4.py

bzrlib/xml5.py

contrib/bash/bzr

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

doc/split-join-files.txt

notes/inventory-v2-sample.xml

notes/inventory-v2.rnc

notes/revfile.txt

notes/schemas.xml

patches

patches/cache-remote-revisions.diff

patches/cache_weave_inclusions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/ndiff.patch

tools

tools/capture_tree.py

tools/convertfile.py

tools/convertinv.py

tools/history2revfiles.py

tools/http_client.py

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files removed:
bzrlib/remotebranch.py

bzrlib/statcache.py

bzrlib/tests.py

files renamed:
bzrlib/store.py => bzrlib/store/__init__.py

elementtree/ => bzrlib/util/elementtree/

urlgrabber/ => bzrlib/util/urlgrabber/

contrib/bash/bzr => contrib/bash/bzr.simple

bzrlib/newinventory.py => contrib/newinventory.py

files modified:
.bzrignore

NEWS

README

TODO

bzrlib/__init__.py

bzrlib/add.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/changeset.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/help.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/log.py

bzrlib/mdiff.py

bzrlib/merge.py

bzrlib/merge_core.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/status.py

bzrlib/textinv.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/util/urlgrabber/keepalive.py

bzrlib/workingtree.py

bzrlib/xml.py

contrib/add-bzr-to-baz

doc/formats.txt

doc/index.txt

doc/todo-from-arch.txt

setup.py

testbzr

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

import sys, zlib, struct, mdiff, stat, os, sha

# TODO: Shouldn't need to lock if we always write in append mode and

# then ftell after writing to see where it went. In any case we

# assume the whole branch is protected by a lock.

import os

import sha

import stat

import struct

100

import sys

101

import zlib

102

from binascii import hexlify, unhexlify

103

factor = 10

104

import bzrlib.mdiff as mdiff

105

106

107

_RECORDSIZE = 48

108

120

FL_GZIP = 1

121

122

# maximum number of patches in a row before recording a whole text.

CHAIN_LIMIT = 50

123

CHAIN_LIMIT = 10

100

124

101

125

102

126

class RevfileError(Exception):

105

129

class LimitHitException(Exception):

106

130

pass

107

131

108

class Revfile:

132

class Revfile(object):

109

133

def __init__(self, basename, mode):

110

134

# TODO: Lock file while open

111

135

133

157

self.idxfile = open(idxname, 'w+b')

134

158

self.datafile = open(dataname, 'w+b')

135

159

136

print 'init empty file'

137

160

self.idxfile.write(_HEADER)

138

161

self.idxfile.flush()

139

162

else:

227

250

return self._add_compressed(text_sha, text, _NO_RECORD, compress)

228

251

229

252

253

# NOT USED

254

def _choose_base(self, seed, base):

255

while seed & 3 == 3:

256

if base == _NO_RECORD:

257

return _NO_RECORD

258

idxrec = self[base]

259

if idxrec[I_BASE] == _NO_RECORD:

260

return base

261

262

base = idxrec[I_BASE]

263

seed >>= 2

264

265

return base # relative to this full text

266

267

268

230

269

def _add_delta(self, text, text_sha, base, compress):

231

270

"""Add a text stored relative to a previous text."""

232

271

self._check_index(base)

233

272

234

273

try:

235

base_text = self.get(base, recursion_limit=CHAIN_LIMIT)

274

base_text = self.get(base, CHAIN_LIMIT)

236

275

except LimitHitException:

237

276

return self._add_full_text(text, text_sha, compress)

238

277

239

278

data = mdiff.bdiff(base_text, text)

279

280

281

if True: # paranoid early check for bad diff

282

result = mdiff.bpatch(base_text, data)

283

assert result == text

284

240

285

241

286

# If the delta is larger than the text, we might as well just

242

287

# store the text. (OK, the delta might be more compressible,

248

293

return self._add_compressed(text_sha, data, base, compress)

249

294

250

295

251

def add(self, text, base=_NO_RECORD, compress=True):

296

def add(self, text, base=None, compress=True):

252

297

"""Add a new text to the revfile.

253

298

254

299

If the text is already present them its existing id is

262

307

only be used if it would be a size win and if the existing

263

308

base is not at too long of a delta chain already.

264

309

"""

310

if base == None:

311

base = _NO_RECORD

312

265

313

self._check_write()

266

314

267

315

text_sha = sha.new(text).digest()

272

320

# it's the same, in case someone ever breaks SHA-1.

273

321

return idx # already present

274

322

323

# base = self._choose_base(ord(text_sha[0]), base)

324

275

325

if base == _NO_RECORD:

276

326

return self._add_full_text(text, text_sha, compress)

277

327

else:

296

346

text = self._get_patched(idx, idxrec, recursion_limit)

297

347

298

348

if sha.new(text).digest() != idxrec[I_SHA]:

299

raise RevfileError("corrupt SHA-1 digest on record %d"

300

% idx)

349

raise RevfileError("corrupt SHA-1 digest on record %d in %s"

350

% (idx, self.basename))

301

351

302

352

return text

303

353

372

422

self._seek_index(idx)

373

423

idxrec = self._read_next_index()

374

424

if idxrec == None:

375

raise IndexError()

425

raise IndexError("no index %d" % idx)

376

426

else:

377

427

return idxrec

378

428

388

438

"""Read back all index records.

389

439

390

440

Do not seek the index file while this is underway!"""

391

sys.stderr.write(" ** iter called ** \n")

441

## sys.stderr.write(" ** iter called ** \n")

392

442

self._seek_index(0)

393

443

while True:

394

444

idxrec = self._read_next_index()

436

486

for idx in range(len(self)):

437

487

t += len(self.get(idx))

438

488

return t

489

490

491

def check(self, pb=None):

492

"""Extract every version and check its hash."""

493

total = len(self)

494

for i in range(total):

495

if pb:

496

pb.update("check revision", i, total)

497

# the get method implicitly checks the SHA-1

498

self.get(i)

499

if pb:

500

pb.clear()

439

501

440

502

441

503

442

504

def main(argv):

443

505

try:

444

506

cmd = argv[1]

507

filename = argv[2]

445

508

except IndexError:

446

sys.stderr.write("usage: revfile dump\n"

447

" revfile add\n"

448

" revfile add-delta BASE\n"

449

" revfile get IDX\n"

450

" revfile find-sha HEX\n"

451

" revfile total-text-size\n"

452

" revfile last\n")

509

sys.stderr.write("usage: revfile dump REVFILE\n"

510

" revfile add REVFILE < INPUT\n"

511

" revfile add-delta REVFILE BASE < INPUT\n"

512

" revfile add-series REVFILE BASE FILE...\n"

513

" revfile get REVFILE IDX\n"

514

" revfile find-sha REVFILE HEX\n"

515

" revfile total-text-size REVFILE\n"

516

" revfile last REVFILE\n")

453

517

return 1

454

518

519

if filename.endswith('.drev') or filename.endswith('.irev'):

520

filename = filename[:-5]

521

455

522

def rw():

456

return Revfile('testrev', 'w')

523

return Revfile(filename, 'w')

457

524

458

525

def ro():

459

return Revfile('testrev', 'r')

526

return Revfile(filename, 'r')

460

527

461

528

if cmd == 'add':

462

529

print rw().add(sys.stdin.read())

463

530

elif cmd == 'add-delta':

464

print rw().add(sys.stdin.read(), int(argv[2]))

531

print rw().add(sys.stdin.read(), int(argv[3]))

532

elif cmd == 'add-series':

533

r = rw()

534

rev = int(argv[3])

535

for fn in argv[4:]:

536

print rev

537

rev = r.add(file(fn).read(), rev)

465

538

elif cmd == 'dump':

466

539

ro().dump()

467

540

elif cmd == 'get':

468

541

try:

469

idx = int(argv[2])

542

idx = int(argv[3])

470

543

except IndexError:

471

sys.stderr.write("usage: revfile get IDX\n")

544

sys.stderr.write("usage: revfile get FILE IDX\n")

472

545

return 1

473

546

547

r = ro()

548

474

549

if idx < 0 or idx >= len(r):

475

550

sys.stderr.write("invalid index %r\n" % idx)

476

551

return 1

477

552

478

sys.stdout.write(ro().get(idx))

553

sys.stdout.write(r.get(idx))

479

554

elif cmd == 'find-sha':

480

555

try:

481

s = unhexlify(argv[2])

556

s = unhexlify(argv[3])

482

557

except IndexError:

483

sys.stderr.write("usage: revfile find-sha HEX\n")

558

sys.stderr.write("usage: revfile find-sha FILE HEX\n")

484

559

return 1

485

560

486

561

idx = ro().find_sha(s)

493

568

print ro().total_text_size()

494

569

elif cmd == 'last':

495

570

print len(ro())-1

571

elif cmd == 'check':

572

import bzrlib.progress

573

pb = bzrlib.progress.ProgressBar()

574

ro().check(pb)

496

575

else:

497

576

sys.stderr.write("unknown command %r\n" % cmd)

498

577

return 1

Older »