~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: mbp at sourcefrog
Date: 2005-04-09 05:26:51 UTC
Revision ID: mbp@sourcefrog.net-20050409052651-73d6952d368eb58574d2e50b

revfile add and add-delta commands print just the index for use by scripts

files added:
bzrlib/tests.py

doc/faq.txt

doc/quickref.txt

doc/roadmap.txt

doc/testing.txt

doc/work-order.txt

test.sh

files removed:
HACKING

Makefile

TODO

bzr-man.py

bzrlib/atomicfile.py

bzrlib/builtins.py

bzrlib/changeset.py

bzrlib/commit.py

bzrlib/delta.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/intset.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/meta_store.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/progress.py

bzrlib/selftest

bzrlib/selftest/TestUtil.py

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/test_merge_core.py

bzrlib/selftest/test_parent.py

bzrlib/selftest/test_smart_add.py

bzrlib/selftest/test_xml.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testfetch.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/teststore.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/shellcomplete.py

bzrlib/status.py

bzrlib/textinv.py

bzrlib/ui.py

bzrlib/upgrade.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/weavestore.py

bzrlib/workingtree.py

bzrlib/xml5.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

doc/revfile-annotation.txt

doc/revfile.txt

doc/split-join-files.txt

doc/switch-in-branch.txt

notes/inventory-v2-sample.xml

notes/inventory-v2.rnc

notes/revfile.txt

notes/schemas.xml

patches

patches/annotate3.patch

patches/annotate4.patch

patches/cache-remote-revisions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/ndiff.patch

patches/pending-merge.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

testbzr

testsweet.py

tools

tools/convertfile.py

tools/convertinv.py

tools/history2revfiles.py

tools/history2weaves.py

tools/http_client.py

tools/testweave.py

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files renamed:
contrib/newinventory.py => bzrlib/newinventory.py

bzrlib/util/elementtree/ => elementtree/

bzrlib/util/urlgrabber/ => urlgrabber/

files modified:
.bzrignore

.rsyncexclude

NEWS

README

build-api

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/mdiff.py

bzrlib/osutils.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/store.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/xml.py

doc/Makefile

doc/bitkeeper.txt

doc/formats.txt

doc/index.txt

doc/interrupted.txt

doc/merge.txt

doc/python.txt

doc/random.txt

doc/tagging.txt

doc/todo-from-arch.txt

elementtree/ElementTree.py

setup.py

urlgrabber/keepalive.py

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

read() call to get everything into memory.

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

# TODO: Shouldn't need to lock if we always write in append mode and

# then ftell after writing to see where it went. In any case we

# assume the whole branch is protected by a lock.

# TODO: Could also try to mmap things...

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

factor = 10

_RECORDSIZE = 48

100

101

_HEADER = "bzr revfile v1\n"

111

112

FL_GZIP = 1

113

114

# maximum number of patches in a row before recording a whole text.

115

CHAIN_LIMIT = 10

116

117

118

class RevfileError(Exception):

119

pass

120

121

class LimitHitException(Exception):

122

pass

123

124

class Revfile(object):

125

def __init__(self, basename, mode):

class Revfile:

100

def __init__(self, basename):

101

# TODO: Option to open readonly

102

126

103

# TODO: Lock file while open

127

104

128

105

# TODO: advise of random access

129

106

130

107

self.basename = basename

131

132

if mode not in ['r', 'w']:

133

raise RevfileError("invalid open mode %r" % mode)

134

self.mode = mode

135

108

136

109

idxname = basename + '.irev'

137

110

dataname = basename + '.drev'

143

116

raise RevfileError("half-assed revfile")

144

117

145

118

if not idx_exists:

146

if mode == 'r':

147

raise RevfileError("Revfile %r does not exist" % basename)

148

149

119

self.idxfile = open(idxname, 'w+b')

150

120

self.datafile = open(dataname, 'w+b')

151

121

122

print 'init empty file'

152

123

self.idxfile.write(_HEADER)

153

124

self.idxfile.flush()

154

125

else:

155

if mode == 'r':

156

diskmode = 'rb'

157

else:

158

diskmode = 'r+b'

159

160

self.idxfile = open(idxname, diskmode)

161

self.datafile = open(dataname, diskmode)

126

self.idxfile = open(idxname, 'r+b')

127

self.datafile = open(dataname, 'r+b')

162

128

163

129

h = self.idxfile.read(_RECORDSIZE)

164

130

if h != _HEADER:

170

136

if idx < 0 or idx > len(self):

171

137

raise RevfileError("invalid index %r" % idx)

172

138

173

def _check_write(self):

174

if self.mode != 'w':

175

raise RevfileError("%r is open readonly" % self.basename)

176

177

139

178

140

def find_sha(self, s):

179

141

assert isinstance(s, str)

183

145

if idxrec[I_SHA] == s:

184

146

return idx

185

147

else:

186

return _NO_RECORD

187

188

189

190

def _add_compressed(self, text_sha, data, base, compress):

191

# well, maybe compress

192

flags = 0

193

if compress:

194

data_len = len(data)

195

if data_len > 50:

196

# don't do compression if it's too small; it's unlikely to win

197

# enough to be worthwhile

198

compr_data = zlib.compress(data)

199

compr_len = len(compr_data)

200

if compr_len < data_len:

201

data = compr_data

202

flags = FL_GZIP

203

##print '- compressed %d -> %d, %.1f%%' \

204

## % (data_len, compr_len, float(compr_len)/float(data_len) * 100.0)

205

return self._add_raw(text_sha, data, base, flags)

206

207

208

209

def _add_raw(self, text_sha, data, base, flags):

148

return _NO_RECORD

149

150

151

def _add_common(self, text_sha, data, base):

210

152

"""Add pre-processed data, can be either full text or delta.

211

153

212

154

This does the compression if that makes sense."""

155

156

flags = 0

157

data_len = len(data)

158

if data_len > 50:

159

# don't do compression if it's too small; it's unlikely to win

160

# enough to be worthwhile

161

compr_data = zlib.compress(data)

162

compr_len = len(compr_data)

163

if compr_len < data_len:

164

data = compr_data

165

flags = FL_GZIP

166

print '- compressed %d -> %d, %.1f%%' \

167

% (data_len, compr_len, float(compr_len)/float(data_len) * 100.0)

168

213

169

idx = len(self)

214

170

self.datafile.seek(0, 2) # to end

215

171

self.idxfile.seek(0, 2)

216

172

assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)

217

173

data_offset = self.datafile.tell()

218

174

219

assert isinstance(data, str) # not unicode or anything weird

175

assert isinstance(data, str) # not unicode or anything wierd

220

176

221

177

self.datafile.write(data)

222

178

self.datafile.flush()

233

189

234

190

235

191

236

def _add_full_text(self, text, text_sha, compress):

192

def _add_full_text(self, text, text_sha):

237

193

"""Add a full text to the file.

238

194

239

195

This is not compressed against any reference version.

240

196

241

197

Returns the index for that text."""

242

return self._add_compressed(text_sha, text, _NO_RECORD, compress)

243

244

245

# NOT USED

246

def _choose_base(self, seed, base):

247

while seed & 3 == 3:

248

if base == _NO_RECORD:

249

return _NO_RECORD

250

idxrec = self[base]

251

if idxrec[I_BASE] == _NO_RECORD:

252

return base

253

254

base = idxrec[I_BASE]

255

seed >>= 2

256

257

return base # relative to this full text

258

259

260

261

def _add_delta(self, text, text_sha, base, compress):

198

return self._add_common(text_sha, text, _NO_RECORD)

199

200

201

def _add_delta(self, text, text_sha, base):

262

202

"""Add a text stored relative to a previous text."""

263

203

self._check_index(base)

264

265

try:

266

base_text = self.get(base, CHAIN_LIMIT)

267

except LimitHitException:

268

return self._add_full_text(text, text_sha, compress)

269

204

base_text = self.get(base)

270

205

data = mdiff.bdiff(base_text, text)

271

272

273

if True: # paranoid early check for bad diff

274

result = mdiff.bpatch(base_text, data)

275

assert result == text

276

277

206

278

207

# If the delta is larger than the text, we might as well just

279

208

# store the text. (OK, the delta might be more compressible,

280

209

# but the overhead of applying it probably still makes it

281

210

# bad, and I don't want to compress both of them to find out.)

282

211

if len(data) >= len(text):

283

return self._add_full_text(text, text_sha, compress)

212

return self._add_full_text(text, text_sha)

284

213

else:

285

return self._add_compressed(text_sha, data, base, compress)

286

287

288

def add(self, text, base=None, compress=True):

214

return self._add_common(text_sha, data, base)

215

216

217

def add(self, text, base=_NO_RECORD):

289

218

"""Add a new text to the revfile.

290

219

291

220

If the text is already present them its existing id is

292

221

returned and the file is not changed.

293

222

294

If compress is true then gzip compression will be used if it

295

reduces the size.

296

297

223

If a base index is specified, that text *may* be used for

298

224

delta compression of the new text. Delta compression will

299

225

only be used if it would be a size win and if the existing

300

226

base is not at too long of a delta chain already.

301

227

"""

302

if base == None:

303

base = _NO_RECORD

304

305

self._check_write()

306

307

228

text_sha = sha.new(text).digest()

308

229

309

230

idx = self.find_sha(text_sha)

312

233

# it's the same, in case someone ever breaks SHA-1.

313

234

return idx # already present

314

235

315

# base = self._choose_base(ord(text_sha[0]), base)

316

317

236

if base == _NO_RECORD:

318

return self._add_full_text(text, text_sha, compress)

237

return self._add_full_text(text, text_sha)

319

238

else:

320

return self._add_delta(text, text_sha, base, compress)

321

322

323

324

def get(self, idx, recursion_limit=None):

325

"""Retrieve text of a previous revision.

326

327

If recursion_limit is an integer then walk back at most that

328

many revisions and then raise LimitHitException, indicating

329

that we ought to record a new file text instead of another

330

delta. Don't use this when trying to get out an existing

331

revision."""

332

239

return self._add_delta(text, text_sha, base)

240

241

242

243

def get(self, idx):

333

244

idxrec = self[idx]

334

245

base = idxrec[I_BASE]

335

246

if base == _NO_RECORD:

336

247

text = self._get_full_text(idx, idxrec)

337

248

else:

338

text = self._get_patched(idx, idxrec, recursion_limit)

249

text = self._get_patched(idx, idxrec)

339

250

340

251

if sha.new(text).digest() != idxrec[I_SHA]:

341

raise RevfileError("corrupt SHA-1 digest on record %d in %s"

342

% (idx, self.basename))

252

raise RevfileError("corrupt SHA-1 digest on record %d"

253

% idx)

343

254

344

255

return text

345

256

377

288

return text

378

289

379

290

380

def _get_patched(self, idx, idxrec, recursion_limit):

291

def _get_patched(self, idx, idxrec):

381

292

base = idxrec[I_BASE]

382

293

assert base >= 0

383

294

assert base < idx # no loops!

384

295

385

if recursion_limit == None:

386

sub_limit = None

387

else:

388

sub_limit = recursion_limit - 1

389

if sub_limit < 0:

390

raise LimitHitException()

391

392

base_text = self.get(base, sub_limit)

296

base_text = self.get(base)

393

297

patch = self._get_raw(idx, idxrec)

394

298

395

299

text = mdiff.bpatch(base_text, patch)

412

316

"""Index by sequence id returns the index field"""

413

317

## TODO: Can avoid seek if we just moved there...

414

318

self._seek_index(idx)

415

idxrec = self._read_next_index()

416

if idxrec == None:

417

raise IndexError("no index %d" % idx)

418

else:

419

return idxrec

319

return self._read_next_index()

420

320

421

321

422

322

def _seek_index(self, idx):

423

323

if idx < 0:

424

324

raise RevfileError("invalid index %r" % idx)

425

325

self.idxfile.seek((idx + 1) * _RECORDSIZE)

426

427

428

429

def __iter__(self):

430

"""Read back all index records.

431

432

Do not seek the index file while this is underway!"""

433

## sys.stderr.write(" ** iter called ** \n")

434

self._seek_index(0)

435

while True:

436

idxrec = self._read_next_index()

437

if not idxrec:

438

break

439

yield idxrec

440

326

441

327

442

328

def _read_next_index(self):

443

329

rec = self.idxfile.read(_RECORDSIZE)

444

330

if not rec:

445

return None

331

raise IndexError("end of index file")

446

332

elif len(rec) != _RECORDSIZE:

447

333

raise RevfileError("short read of %d bytes getting index %d from %r"

448

334

% (len(rec), idx, self.basename))

464

350

f.write("#%-7d " % rec[1])

465

351

466

352

f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))

467

468

469

def total_text_size(self):

470

"""Return the sum of sizes of all file texts.

471

472

This is how much space they would occupy if they were stored without

473

delta and gzip compression.

474

475

As a side effect this completely validates the Revfile, checking that all

476

texts can be reproduced with the correct SHA-1."""

477

t = 0L

478

for idx in range(len(self)):

479

t += len(self.get(idx))

480

return t

481

482

483

def check(self, pb=None):

484

"""Extract every version and check its hash."""

485

total = len(self)

486

for i in range(total):

487

if pb:

488

pb.update("check revision", i, total)

489

# the get method implicitly checks the SHA-1

490

self.get(i)

491

if pb:

492

pb.clear()

493

353

494

354

495

355

496

356

def main(argv):

357

r = Revfile("testrev")

358

497

359

try:

498

360

cmd = argv[1]

499

filename = argv[2]

500

361

except IndexError:

501

sys.stderr.write("usage: revfile dump REVFILE\n"

502

" revfile add REVFILE < INPUT\n"

503

" revfile add-delta REVFILE BASE < INPUT\n"

504

" revfile add-series REVFILE BASE FILE...\n"

505

" revfile get REVFILE IDX\n"

506

" revfile find-sha REVFILE HEX\n"

507

" revfile total-text-size REVFILE\n"

508

" revfile last REVFILE\n")

362

sys.stderr.write("usage: revfile dump\n"

363

" revfile add\n"

364

" revfile add-delta BASE\n"

365

" revfile get IDX\n"

366

" revfile find-sha HEX\n")

509

367

return 1

510

511

if filename.endswith('.drev') or filename.endswith('.irev'):

512

filename = filename[:-5]

513

514

def rw():

515

return Revfile(filename, 'w')

516

517

def ro():

518

return Revfile(filename, 'r')

368

519

369

520

370

if cmd == 'add':

521

print rw().add(sys.stdin.read())

371

new_idx = r.add(sys.stdin.read())

372

print new_idx

522

373

elif cmd == 'add-delta':

523

print rw().add(sys.stdin.read(), int(argv[3]))

524

elif cmd == 'add-series':

525

r = rw()

526

rev = int(argv[3])

527

for fn in argv[4:]:

528

print rev

529

rev = r.add(file(fn).read(), rev)

374

new_idx = r.add(sys.stdin.read(), int(argv[2]))

375

print new_idx

530

376

elif cmd == 'dump':

531

ro().dump()

377

r.dump()

532

378

elif cmd == 'get':

533

379

try:

534

idx = int(argv[3])

380

idx = int(argv[2])

535

381

except IndexError:

536

sys.stderr.write("usage: revfile get FILE IDX\n")

382

sys.stderr.write("usage: revfile get IDX\n")

537

383

return 1

538

384

539

r = ro()

540

541

385

if idx < 0 or idx >= len(r):

542

386

sys.stderr.write("invalid index %r\n" % idx)

543

387

return 1

545

389

sys.stdout.write(r.get(idx))

546

390

elif cmd == 'find-sha':

547

391

try:

548

s = unhexlify(argv[3])

392

s = unhexlify(argv[2])

549

393

except IndexError:

550

sys.stderr.write("usage: revfile find-sha FILE HEX\n")

394

sys.stderr.write("usage: revfile find-sha HEX\n")

551

395

return 1

552

396

553

idx = ro().find_sha(s)

397

idx = r.find_sha(s)

554

398

if idx == _NO_RECORD:

555

399

sys.stderr.write("no such record\n")

556

400

return 1

557

401

else:

558

402

print idx

559

elif cmd == 'total-text-size':

560

print ro().total_text_size()

561

elif cmd == 'last':

562

print len(ro())-1

563

elif cmd == 'check':

564

import bzrlib.progress

565

pb = bzrlib.progress.ProgressBar()

566

ro().check(pb)

403

567

404

else:

568

405

sys.stderr.write("unknown command %r\n" % cmd)

569

406

return 1

Older »