~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: mbp at sourcefrog
Date: 2005-04-09 05:35:10 UTC
Revision ID: mbp@sourcefrog.net-20050409053509-41167bb43e000b1d14992189

Revfile: make compression optional, in case people are storing files they know won't compress

files added:
bzrlib/remotebranch.py

bzrlib/tests.py

doc/faq.txt

doc/quickref.txt

doc/roadmap.txt

doc/testing.txt

doc/work-order.txt

test.sh

files removed:
HACKING

Makefile

NEWS.developers

TODO

bzr-man.py

bzrlib/annotate.py

bzrlib/atomicfile.py

bzrlib/builtins.py

bzrlib/changeset.py

bzrlib/clone.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/delta.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/graph.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/identitymap.py

bzrlib/intset.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/progress.py

bzrlib/revisionspec.py

bzrlib/selftest

bzrlib/selftest/HTTPTestUtil.py

bzrlib/selftest/TestUtil.py

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/test_ancestry.py

bzrlib/selftest/test_bad_files.py

bzrlib/selftest/test_commit.py

bzrlib/selftest/test_commit_merge.py

bzrlib/selftest/test_conflicts.py

bzrlib/selftest/test_merge_core.py

bzrlib/selftest/test_parent.py

bzrlib/selftest/test_revision_info.py

bzrlib/selftest/test_smart_add.py

bzrlib/selftest/test_upgrade.py

bzrlib/selftest/test_weave.py

bzrlib/selftest/test_xml.py

bzrlib/selftest/testannotate.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testconfig.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testfetch.py

bzrlib/selftest/testgraph.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testidentitymap.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/testrevprops.py

bzrlib/selftest/testsampler.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/teststore.py

bzrlib/selftest/testtestament.py

bzrlib/selftest/testtransactions.py

bzrlib/selftest/testtransport.py

bzrlib/selftest/testworkingtree.py

bzrlib/selftest/treeshape.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/shellcomplete.py

bzrlib/status.py

bzrlib/store

bzrlib/store/compressed_text.py

bzrlib/store/text.py

bzrlib/store/weave.py

bzrlib/testament.py

bzrlib/textinv.py

bzrlib/transactions.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/http.py

bzrlib/transport/local.py

bzrlib/ui.py

bzrlib/upgrade.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/workingtree.py

bzrlib/xml4.py

bzrlib/xml5.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

doc/revfile-annotation.txt

doc/revfile.txt

doc/split-join-files.txt

doc/switch-in-branch.txt

notes/inventory-v2-sample.xml

notes/inventory-v2.rnc

notes/revfile.txt

notes/schemas.xml

patches

patches/cache-remote-revisions.diff

patches/cache_weave_inclusions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/ndiff.patch

testbzr

tools

tools/capture_tree.py

tools/convertfile.py

tools/convertinv.py

tools/history2revfiles.py

tools/http_client.py

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files renamed:
contrib/newinventory.py => bzrlib/newinventory.py

bzrlib/store/__init__.py => bzrlib/store.py

bzrlib/util/elementtree/ => elementtree/

bzrlib/util/urlgrabber/ => urlgrabber/

files modified:
.bzrignore

.rsyncexclude

NEWS

README

build-api

bzr *

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/mdiff.py

bzrlib/osutils.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/xml.py

doc/Makefile

doc/bitkeeper.txt

doc/formats.txt

doc/index.txt

doc/interrupted.txt

doc/merge.txt

doc/python.txt

doc/random.txt

doc/tagging.txt

doc/todo-from-arch.txt

elementtree/ElementTree.py

setup.py

urlgrabber/keepalive.py

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

read() call to get everything into memory.

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

# TODO: Shouldn't need to lock if we always write in append mode and

# then ftell after writing to see where it went. In any case we

# assume the whole branch is protected by a lock.

import os

import sha

import stat

import struct

100

import sys

101

import zlib

# TODO: Could also try to mmap things...

import sys, zlib, struct, mdiff, stat, os, sha

102

from binascii import hexlify, unhexlify

103

104

import bzrlib.mdiff as mdiff

105

factor = 10

106

107

_RECORDSIZE = 48

108

119

120

FL_GZIP = 1

121

122

# maximum number of patches in a row before recording a whole text.

123

CHAIN_LIMIT = 10

124

125

126

class RevfileError(Exception):

127

pass

128

129

class LimitHitException(Exception):

130

pass

131

132

class Revfile(object):

133

def __init__(self, basename, mode):

class Revfile:

100

def __init__(self, basename):

101

# TODO: Option to open readonly

102

134

103

# TODO: Lock file while open

135

104

136

105

# TODO: advise of random access

137

106

138

107

self.basename = basename

139

140

if mode not in ['r', 'w']:

141

raise RevfileError("invalid open mode %r" % mode)

142

self.mode = mode

143

108

144

109

idxname = basename + '.irev'

145

110

dataname = basename + '.drev'

151

116

raise RevfileError("half-assed revfile")

152

117

153

118

if not idx_exists:

154

if mode == 'r':

155

raise RevfileError("Revfile %r does not exist" % basename)

156

157

119

self.idxfile = open(idxname, 'w+b')

158

120

self.datafile = open(dataname, 'w+b')

159

121

122

print 'init empty file'

160

123

self.idxfile.write(_HEADER)

161

124

self.idxfile.flush()

162

125

else:

163

if mode == 'r':

164

diskmode = 'rb'

165

else:

166

diskmode = 'r+b'

167

168

self.idxfile = open(idxname, diskmode)

169

self.datafile = open(dataname, diskmode)

126

self.idxfile = open(idxname, 'r+b')

127

self.datafile = open(dataname, 'r+b')

170

128

171

129

h = self.idxfile.read(_RECORDSIZE)

172

130

if h != _HEADER:

178

136

if idx < 0 or idx > len(self):

179

137

raise RevfileError("invalid index %r" % idx)

180

138

181

def _check_write(self):

182

if self.mode != 'w':

183

raise RevfileError("%r is open readonly" % self.basename)

184

185

139

186

140

def find_sha(self, s):

187

141

assert isinstance(s, str)

224

178

assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)

225

179

data_offset = self.datafile.tell()

226

180

227

assert isinstance(data, str) # not unicode or anything weird

181

assert isinstance(data, str) # not unicode or anything wierd

228

182

229

183

self.datafile.write(data)

230

184

self.datafile.flush()

241

195

242

196

243

197

244

def _add_full_text(self, text, text_sha, compress):

198

def _add_full_text(self, text, text_sha):

245

199

"""Add a full text to the file.

246

200

247

201

This is not compressed against any reference version.

250

204

return self._add_compressed(text_sha, text, _NO_RECORD, compress)

251

205

252

206

253

# NOT USED

254

def _choose_base(self, seed, base):

255

while seed & 3 == 3:

256

if base == _NO_RECORD:

257

return _NO_RECORD

258

idxrec = self[base]

259

if idxrec[I_BASE] == _NO_RECORD:

260

return base

261

262

base = idxrec[I_BASE]

263

seed >>= 2

264

265

return base # relative to this full text

266

267

268

269

207

def _add_delta(self, text, text_sha, base, compress):

270

208

"""Add a text stored relative to a previous text."""

271

209

self._check_index(base)

272

273

try:

274

base_text = self.get(base, CHAIN_LIMIT)

275

except LimitHitException:

276

return self._add_full_text(text, text_sha, compress)

277

210

base_text = self.get(base)

278

211

data = mdiff.bdiff(base_text, text)

279

280

281

if True: # paranoid early check for bad diff

282

result = mdiff.bpatch(base_text, data)

283

assert result == text

284

285

212

286

213

# If the delta is larger than the text, we might as well just

287

214

# store the text. (OK, the delta might be more compressible,

293

220

return self._add_compressed(text_sha, data, base, compress)

294

221

295

222

296

def add(self, text, base=None, compress=True):

223

def add(self, text, base=_NO_RECORD, compress=True):

297

224

"""Add a new text to the revfile.

298

225

299

226

If the text is already present then its existing id is

307

234

only be used if it would be a size win and if the existing

308

235

base is not at too long of a delta chain already.

309

236

"""

310

if base == None:

311

base = _NO_RECORD

312

313

self._check_write()

314

315

237

text_sha = sha.new(text).digest()

316

238

317

239

idx = self.find_sha(text_sha)

320

242

# it's the same, in case someone ever breaks SHA-1.

321

243

return idx # already present

322

244

323

# base = self._choose_base(ord(text_sha[0]), base)

324

325

245

if base == _NO_RECORD:

326

246

return self._add_full_text(text, text_sha, compress)

327

247

else:

329

249

330

250

331

251

332

def get(self, idx, recursion_limit=None):

333

"""Retrieve text of a previous revision.

334

335

If recursion_limit is an integer then walk back at most that

336

many revisions and then raise LimitHitException, indicating

337

that we ought to record a new file text instead of another

338

delta. Don't use this when trying to get out an existing

339

revision."""

340

252

def get(self, idx):

341

253

idxrec = self[idx]

342

254

base = idxrec[I_BASE]

343

255

if base == _NO_RECORD:

344

256

text = self._get_full_text(idx, idxrec)

345

257

else:

346

text = self._get_patched(idx, idxrec, recursion_limit)

258

text = self._get_patched(idx, idxrec)

347

259

348

260

if sha.new(text).digest() != idxrec[I_SHA]:

349

raise RevfileError("corrupt SHA-1 digest on record %d in %s"

350

% (idx, self.basename))

261

raise RevfileError("corrupt SHA-1 digest on record %d"

262

% idx)

351

263

352

264

return text

353

265

385

297

return text

386

298

387

299

388

def _get_patched(self, idx, idxrec, recursion_limit):

300

def _get_patched(self, idx, idxrec):

389

301

base = idxrec[I_BASE]

390

302

assert base >= 0

391

303

assert base < idx # no loops!

392

304

393

if recursion_limit == None:

394

sub_limit = None

395

else:

396

sub_limit = recursion_limit - 1

397

if sub_limit < 0:

398

raise LimitHitException()

399

400

base_text = self.get(base, sub_limit)

305

base_text = self.get(base)

401

306

patch = self._get_raw(idx, idxrec)

402

307

403

308

text = mdiff.bpatch(base_text, patch)

420

325

"""Index by sequence id returns the index field"""

421

326

## TODO: Can avoid seek if we just moved there...

422

327

self._seek_index(idx)

423

idxrec = self._read_next_index()

424

if idxrec == None:

425

raise IndexError("no index %d" % idx)

426

else:

427

return idxrec

328

return self._read_next_index()

428

329

429

330

430

331

def _seek_index(self, idx):

431

332

if idx < 0:

432

333

raise RevfileError("invalid index %r" % idx)

433

334

self.idxfile.seek((idx + 1) * _RECORDSIZE)

434

435

436

437

def __iter__(self):

438

"""Read back all index records.

439

440

Do not seek the index file while this is underway!"""

441

## sys.stderr.write(" ** iter called ** \n")

442

self._seek_index(0)

443

while True:

444

idxrec = self._read_next_index()

445

if not idxrec:

446

break

447

yield idxrec

448

335

449

336

450

337

def _read_next_index(self):

451

338

rec = self.idxfile.read(_RECORDSIZE)

452

339

if not rec:

453

return None

340

raise IndexError("end of index file")

454

341

elif len(rec) != _RECORDSIZE:

455

342

raise RevfileError("short read of %d bytes getting index %d from %r"

456

343

% (len(rec), idx, self.basename))

472

359

f.write("#%-7d " % rec[1])

473

360

474

361

f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))

475

476

477

def total_text_size(self):

478

"""Return the sum of sizes of all file texts.

479

480

This is how much space they would occupy if they were stored without

481

delta and gzip compression.

482

483

As a side effect this completely validates the Revfile, checking that all

484

texts can be reproduced with the correct SHA-1."""

485

t = 0L

486

for idx in range(len(self)):

487

t += len(self.get(idx))

488

return t

489

490

491

def check(self, pb=None):

492

"""Extract every version and check its hash."""

493

total = len(self)

494

for i in range(total):

495

if pb:

496

pb.update("check revision", i, total)

497

# the get method implicitly checks the SHA-1

498

self.get(i)

499

if pb:

500

pb.clear()

501

362

502

363

503

364

504

365

def main(argv):

366

r = Revfile("testrev")

367

505

368

try:

506

369

cmd = argv[1]

507

filename = argv[2]

508

370

except IndexError:

509

sys.stderr.write("usage: revfile dump REVFILE\n"

510

" revfile add REVFILE < INPUT\n"

511

" revfile add-delta REVFILE BASE < INPUT\n"

512

" revfile add-series REVFILE BASE FILE...\n"

513

" revfile get REVFILE IDX\n"

514

" revfile find-sha REVFILE HEX\n"

515

" revfile total-text-size REVFILE\n"

516

" revfile last REVFILE\n")

371

sys.stderr.write("usage: revfile dump\n"

372

" revfile add\n"

373

" revfile add-delta BASE\n"

374

" revfile get IDX\n"

375

" revfile find-sha HEX\n")

517

376

return 1

518

519

if filename.endswith('.drev') or filename.endswith('.irev'):

520

filename = filename[:-5]

521

522

def rw():

523

return Revfile(filename, 'w')

524

525

def ro():

526

return Revfile(filename, 'r')

377

527

378

528

379

if cmd == 'add':

529

print rw().add(sys.stdin.read())

380

new_idx = r.add(sys.stdin.read())

381

print new_idx

530

382

elif cmd == 'add-delta':

531

print rw().add(sys.stdin.read(), int(argv[3]))

532

elif cmd == 'add-series':

533

r = rw()

534

rev = int(argv[3])

535

for fn in argv[4:]:

536

print rev

537

rev = r.add(file(fn).read(), rev)

383

new_idx = r.add(sys.stdin.read(), int(argv[2]))

384

print new_idx

538

385

elif cmd == 'dump':

539

ro().dump()

386

r.dump()

540

387

elif cmd == 'get':

541

388

try:

542

idx = int(argv[3])

389

idx = int(argv[2])

543

390

except IndexError:

544

sys.stderr.write("usage: revfile get FILE IDX\n")

391

sys.stderr.write("usage: revfile get IDX\n")

545

392

return 1

546

393

547

r = ro()

548

549

394

if idx < 0 or idx >= len(r):

550

395

sys.stderr.write("invalid index %r\n" % idx)

551

396

return 1

553

398

sys.stdout.write(r.get(idx))

554

399

elif cmd == 'find-sha':

555

400

try:

556

s = unhexlify(argv[3])

401

s = unhexlify(argv[2])

557

402

except IndexError:

558

sys.stderr.write("usage: revfile find-sha FILE HEX\n")

403

sys.stderr.write("usage: revfile find-sha HEX\n")

559

404

return 1

560

405

561

idx = ro().find_sha(s)

406

idx = r.find_sha(s)

562

407

if idx == _NO_RECORD:

563

408

sys.stderr.write("no such record\n")

564

409

return 1

565

410

else:

566

411

print idx

567

elif cmd == 'total-text-size':

568

print ro().total_text_size()

569

elif cmd == 'last':

570

print len(ro())-1

571

elif cmd == 'check':

572

import bzrlib.progress

573

pb = bzrlib.progress.ProgressBar()

574

ro().check(pb)

412

575

413

else:

576

414

sys.stderr.write("unknown command %r\n" % cmd)

577

415

return 1

Older »