~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: Martin Pool
Date: 2005-08-12 15:41:44 UTC
Revision ID: mbp@sourcefrog.net-20050812154144-bc98570a78b8f633

- merge in deferred revfile work

files added:
HACKING

TODO

bzr-man.py

bzrlib/atomicfile.py

bzrlib/changeset.py

bzrlib/commit.py

bzrlib/delta.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/intset.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/meta_store.py

bzrlib/missing.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/selftest

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/status.py

bzrlib/textinv.py

bzrlib/upgrade.py

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/workingtree.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

doc/revfile-annotation.txt

doc/revfile.txt

doc/split-join-files.txt

doc/switch-in-branch.txt

effbot

effbot/__init__.py

effbot/org

effbot/org/__init__.py

effbot/org/gzip_consumer.py

effbot/org/http_client.py

effbot/org/http_manager.py

notes/revfile.txt

patches

patches/annotate3.patch

patches/annotate4.patch

patches/cache-remote-revisions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/ndiff.patch

patches/pending-merge.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

plugins

plugins/changeset

plugins/changeset/__init__.py

plugins/changeset/apply_changeset.py

plugins/changeset/common.py

plugins/changeset/gen_changeset.py

plugins/changeset/read_changeset.py

plugins/checkperms

testbzr

testsweet.py

tools

tools/convertfile.py

tools/convertinv.py

tools/testweave.py

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files removed:
bzrlib/tests.py

doc/faq.txt

doc/quickref.txt

doc/roadmap.txt

doc/testing.txt

doc/work-order.txt

test.sh

files modified:
.bzrignore

.rsyncexclude

NEWS

README

build-api

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/mdiff.py

bzrlib/newinventory.py

bzrlib/osutils.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/store.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/xml.py

doc/Makefile

doc/bitkeeper.txt

doc/formats.txt

doc/index.txt

doc/interrupted.txt

doc/merge.txt

doc/python.txt

doc/random.txt

doc/tagging.txt

doc/todo-from-arch.txt

elementtree/ElementTree.py

urlgrabber/keepalive.py

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

read() call to get everything into memory.

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things...

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

factor = 10

_RECORDSIZE = 48

_HEADER = "bzr revfile v1\n"

109

110

FL_GZIP = 1

111

112

# maximum number of patches in a row before recording a whole text.

113

CHAIN_LIMIT = 25

114

115

116

class RevfileError(Exception):

117

pass

118

class Revfile:

100

def __init__(self, basename):

101

# TODO: Option to open readonly

102

119

class LimitHitException(Exception):

120

pass

121

122

class Revfile(object):

123

def __init__(self, basename, mode):

103

124

# TODO: Lock file while open

104

125

105

126

# TODO: advise of random access

106

127

107

128

self.basename = basename

129

130

if mode not in ['r', 'w']:

131

raise RevfileError("invalid open mode %r" % mode)

132

self.mode = mode

108

133

109

134

idxname = basename + '.irev'

110

135

dataname = basename + '.drev'

111

136

112

self.idxpos = 0L

113

114

137

idx_exists = os.path.exists(idxname)

115

138

data_exists = os.path.exists(dataname)

116

139

118

141

raise RevfileError("half-assed revfile")

119

142

120

143

if not idx_exists:

144

if mode == 'r':

145

raise RevfileError("Revfile %r does not exist" % basename)

146

121

147

self.idxfile = open(idxname, 'w+b')

122

148

self.datafile = open(dataname, 'w+b')

123

149

125

151

self.idxfile.write(_HEADER)

126

152

self.idxfile.flush()

127

153

else:

128

self.idxfile = open(idxname, 'r+b')

129

self.datafile = open(dataname, 'r+b')

154

if mode == 'r':

155

diskmode = 'rb'

156

else:

157

diskmode = 'r+b'

158

159

self.idxfile = open(idxname, diskmode)

160

self.datafile = open(dataname, diskmode)

130

161

131

162

h = self.idxfile.read(_RECORDSIZE)

132

163

if h != _HEADER:

134

165

% (h, self.basename))

135

166

136

167

137

138

def revision(self, rev):

139

base = self.index[rev][0]

140

start = self.index[base][1]

141

end = self.index[rev][1] + self.index[rev][2]

142

f = open(self.datafile())

143

144

f.seek(start)

145

data = f.read(end - start)

146

147

last = self.index[base][2]

148

text = zlib.decompress(data[:last])

149

150

for r in range(base + 1, rev + 1):

151

s = self.index[r][2]

152

b = zlib.decompress(data[last:last + s])

153

text = mdiff.bpatch(text, b)

154

last = last + s

155

156

return text

157

158

159

168

def _check_index(self, idx):

160

169

if idx < 0 or idx > len(self):

161

170

raise RevfileError("invalid index %r" % idx)

162

171

172

def _check_write(self):

173

if self.mode != 'w':

174

raise RevfileError("%r is open readonly" % self.basename)

175

163

176

164

177

def find_sha(self, s):

165

178

assert isinstance(s, str)

169

182

if idxrec[I_SHA] == s:

170

183

return idx

171

184

else:

172

return _NO_RECORD

173

174

175

def _add_common(self, text_sha, data, base):

185

return _NO_RECORD

186

187

188

189

def _add_compressed(self, text_sha, data, base, compress):

190

# well, maybe compress

191

flags = 0

192

if compress:

193

data_len = len(data)

194

if data_len > 50:

195

# don't do compression if it's too small; it's unlikely to win

196

# enough to be worthwhile

197

compr_data = zlib.compress(data)

198

compr_len = len(compr_data)

199

if compr_len < data_len:

200

data = compr_data

201

flags = FL_GZIP

202

##print '- compressed %d -> %d, %.1f%%' \

203

## % (data_len, compr_len, float(compr_len)/float(data_len) * 100.0)

204

return self._add_raw(text_sha, data, base, flags)

205

206

207

208

def _add_raw(self, text_sha, data, base, flags):

176

209

"""Add pre-processed data, can be either full text or delta.

177

210

178

211

This does the compression if that makes sense."""

179

180

flags = 0

181

if len(data) > 50:

182

# don't do compression if it's too small; it's unlikely to win

183

# enough to be worthwhile

184

compr_data = zlib.compress(data)

185

if len(compr_data) < len(data):

186

data = compr_data

187

flags = FL_GZIP

188

189

212

idx = len(self)

190

213

self.datafile.seek(0, 2) # to end

191

214

self.idxfile.seek(0, 2)

192

215

assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)

193

216

data_offset = self.datafile.tell()

194

217

195

assert isinstance(data, str) # not unicode or anything wierd

218

assert isinstance(data, str) # not unicode or anything weird

196

219

197

220

self.datafile.write(data)

198

221

self.datafile.flush()

209

232

210

233

211

234

212

def _add_full_text(self, text, text_sha):

235

def _add_full_text(self, text, text_sha, compress):

213

236

"""Add a full text to the file.

214

237

215

238

This is not compressed against any reference version.

216

239

217

240

Returns the index for that text."""

218

return self._add_common(text_sha, text, _NO_RECORD)

219

220

221

def _add_delta(self, text, text_sha, base):

241

return self._add_compressed(text_sha, text, _NO_RECORD, compress)

242

243

244

# NOT USED

245

def _choose_base(self, seed, base):

246

while seed & 3 == 3:

247

if base == _NO_RECORD:

248

return _NO_RECORD

249

idxrec = self[base]

250

if idxrec[I_BASE] == _NO_RECORD:

251

return base

252

253

base = idxrec[I_BASE]

254

seed >>= 2

255

256

return base # relative to this full text

257

258

259

260

def _add_delta(self, text, text_sha, base, compress):

222

261

"""Add a text stored relative to a previous text."""

223

262

self._check_index(base)

224

base_text = self.get(base)

263

264

try:

265

base_text = self.get(base, CHAIN_LIMIT)

266

except LimitHitException:

267

return self._add_full_text(text, text_sha, compress)

268

225

269

data = mdiff.bdiff(base_text, text)

226

return self._add_common(text_sha, data, base)

227

228

229

def add(self, text, base=_NO_RECORD):

270

271

# If the delta is larger than the text, we might as well just

272

# store the text. (OK, the delta might be more compressible,

273

# but the overhead of applying it probably still makes it

274

# bad, and I don't want to compress both of them to find out.)

275

if len(data) >= len(text):

276

return self._add_full_text(text, text_sha, compress)

277

else:

278

return self._add_compressed(text_sha, data, base, compress)

279

280

281

def add(self, text, base=_NO_RECORD, compress=True):

282

"""Add a new text to the revfile.

283

284

If the text is already present them its existing id is

285

returned and the file is not changed.

286

287

If compress is true then gzip compression will be used if it

288

reduces the size.

289

290

If a base index is specified, that text *may* be used for

291

delta compression of the new text. Delta compression will

292

only be used if it would be a size win and if the existing

293

base is not at too long of a delta chain already.

294

"""

295

self._check_write()

296

230

297

text_sha = sha.new(text).digest()

231

298

232

299

idx = self.find_sha(text_sha)

233

300

if idx != _NO_RECORD:

301

# TODO: Optional paranoid mode where we read out that record and make sure

302

# it's the same, in case someone ever breaks SHA-1.

234

303

return idx # already present

235

304

305

# base = self._choose_base(ord(text_sha[0]), base)

306

236

307

if base == _NO_RECORD:

237

return self._add_full_text(text, text_sha)

238

else:

239

return self._add_delta(text, text_sha, base)

240

241

242

def addrevision(self, text, changeset):

243

t = self.tip()

244

n = t + 1

245

246

if not n % factor:

247

data = zlib.compress(text)

248

base = n

249

else:

250

prev = self.revision(t)

251

data = zlib.compress(mdiff.bdiff(prev, text))

252

base = self.index[t][0]

253

254

offset = 0

255

if t >= 0:

256

offset = self.index[t][1] + self.index[t][2]

257

258

self.index.append((base, offset, len(data), changeset))

259

entry = struct.pack(">llll", base, offset, len(data), changeset)

260

261

open(self.indexfile(), "a").write(entry)

262

open(self.datafile(), "a").write(data)

263

264

265

266

def get(self, idx):

308

return self._add_full_text(text, text_sha, compress)

309

else:

310

return self._add_delta(text, text_sha, base, compress)

311

312

313

314

def get(self, idx, recursion_limit=None):

315

"""Retrieve text of a previous revision.

316

317

If recursion_limit is an integer then walk back at most that

318

many revisions and then raise LimitHitException, indicating

319

that we ought to record a new file text instead of another

320

delta. Don't use this when trying to get out an existing

321

revision."""

322

267

323

idxrec = self[idx]

268

324

base = idxrec[I_BASE]

269

325

if base == _NO_RECORD:

270

326

text = self._get_full_text(idx, idxrec)

271

327

else:

272

text = self._get_patched(idx, idxrec)

328

text = self._get_patched(idx, idxrec, recursion_limit)

273

329

274

330

if sha.new(text).digest() != idxrec[I_SHA]:

275

331

raise RevfileError("corrupt SHA-1 digest on record %d"

280

336

281

337

282

338

def _get_raw(self, idx, idxrec):

339

flags = idxrec[I_FLAGS]

340

if flags & ~FL_GZIP:

341

raise RevfileError("unsupported index flags %#x on index %d"

342

% (flags, idx))

343

283

344

l = idxrec[I_LEN]

284

345

if l == 0:

285

346

return ''

292

353

"getting text for record %d in %r"

293

354

% (len(data), l, idx, self.basename))

294

355

356

if flags & FL_GZIP:

357

data = zlib.decompress(data)

358

295

359

return data

296

360

297

361

298

362

def _get_full_text(self, idx, idxrec):

299

assert idxrec[I_FLAGS] == 0

300

363

assert idxrec[I_BASE] == _NO_RECORD

301

364

302

365

text = self._get_raw(idx, idxrec)

304

367

return text

305

368

306

369

307

def _get_patched(self, idx, idxrec):

308

assert idxrec[I_FLAGS] == 0

370

def _get_patched(self, idx, idxrec, recursion_limit):

309

371

base = idxrec[I_BASE]

310

372

assert base >= 0

311

373

assert base < idx # no loops!

312

374

313

base_text = self.get(base)

375

if recursion_limit == None:

376

sub_limit = None

377

else:

378

sub_limit = recursion_limit - 1

379

if sub_limit < 0:

380

raise LimitHitException()

381

382

base_text = self.get(base, sub_limit)

314

383

patch = self._get_raw(idx, idxrec)

315

384

316

385

text = mdiff.bpatch(base_text, patch)

333

402

"""Index by sequence id returns the index field"""

334

403

## TODO: Can avoid seek if we just moved there...

335

404

self._seek_index(idx)

336

return self._read_next_index()

405

idxrec = self._read_next_index()

406

if idxrec == None:

407

raise IndexError("no index %d" % idx)

408

else:

409

return idxrec

337

410

338

411

339

412

def _seek_index(self, idx):

340

413

if idx < 0:

341

414

raise RevfileError("invalid index %r" % idx)

342

415

self.idxfile.seek((idx + 1) * _RECORDSIZE)

416

417

418

419

def __iter__(self):

420

"""Read back all index records.

421

422

Do not seek the index file while this is underway!"""

423

## sys.stderr.write(" ** iter called ** \n")

424

self._seek_index(0)

425

while True:

426

idxrec = self._read_next_index()

427

if not idxrec:

428

break

429

yield idxrec

343

430

344

431

345

432

def _read_next_index(self):

346

433

rec = self.idxfile.read(_RECORDSIZE)

347

434

if not rec:

348

raise IndexError("end of index file")

435

return None

349

436

elif len(rec) != _RECORDSIZE:

350

437

raise RevfileError("short read of %d bytes getting index %d from %r"

351

438

% (len(rec), idx, self.basename))

367

454

f.write("#%-7d " % rec[1])

368

455

369

456

f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))

457

458

459

def total_text_size(self):

460

"""Return the sum of sizes of all file texts.

461

462

This is how much space they would occupy if they were stored without

463

delta and gzip compression.

464

465

As a side effect this completely validates the Revfile, checking that all

466

texts can be reproduced with the correct SHA-1."""

467

t = 0L

468

for idx in range(len(self)):

469

t += len(self.get(idx))

470

return t

370

471

371

472

372

473

373

474

def main(argv):

374

r = Revfile("testrev")

375

376

475

try:

377

476

cmd = argv[1]

477

filename = argv[2]

378

478

except IndexError:

379

sys.stderr.write("usage: revfile dump\n"

380

" revfile add\n"

381

" revfile add-delta BASE\n"

382

" revfile get IDX\n"

383

" revfile find-sha HEX\n")

479

sys.stderr.write("usage: revfile dump REVFILE\n"

480

" revfile add REVFILE < INPUT\n"

481

" revfile add-delta REVFILE BASE < INPUT\n"

482

" revfile add-series REVFILE BASE FILE...\n"

483

" revfile get REVFILE IDX\n"

484

" revfile find-sha REVFILE HEX\n"

485

" revfile total-text-size REVFILE\n"

486

" revfile last REVFILE\n")

384

487

return 1

385

488

489

def rw():

490

return Revfile(filename, 'w')

491

492

def ro():

493

return Revfile(filename, 'r')

386

494

387

495

if cmd == 'add':

388

new_idx = r.add(sys.stdin.read())

389

print 'added idx %d' % new_idx

496

print rw().add(sys.stdin.read())

390

497

elif cmd == 'add-delta':

391

new_idx = r.add(sys.stdin.read(), int(argv[2]))

392

print 'added idx %d' % new_idx

498

print rw().add(sys.stdin.read(), int(argv[3]))

499

elif cmd == 'add-series':

500

r = rw()

501

rev = int(argv[3])

502

for fn in argv[4:]:

503

print rev

504

rev = r.add(file(fn).read(), rev)

393

505

elif cmd == 'dump':

394

r.dump()

506

ro().dump()

395

507

elif cmd == 'get':

396

508

try:

397

idx = int(argv[2])

509

idx = int(argv[3])

398

510

except IndexError:

399

sys.stderr.write("usage: revfile get IDX\n")

511

sys.stderr.write("usage: revfile get FILE IDX\n")

400

512

return 1

401

513

514

r = ro()

515

402

516

if idx < 0 or idx >= len(r):

403

517

sys.stderr.write("invalid index %r\n" % idx)

404

518

return 1

406

520

sys.stdout.write(r.get(idx))

407

521

elif cmd == 'find-sha':

408

522

try:

409

s = unhexlify(argv[2])

523

s = unhexlify(argv[3])

410

524

except IndexError:

411

sys.stderr.write("usage: revfile find-sha HEX\n")

525

sys.stderr.write("usage: revfile find-sha FILE HEX\n")

412

526

return 1

413

527

414

idx = r.find_sha(s)

528

idx = ro().find_sha(s)

415

529

if idx == _NO_RECORD:

416

530

sys.stderr.write("no such record\n")

417

531

return 1

418

532

else:

419

533

print idx

420

534

elif cmd == 'total-text-size':

535

print ro().total_text_size()

536

elif cmd == 'last':

537

print len(ro())-1

421

538

else:

422

539

sys.stderr.write("unknown command %r\n" % cmd)

423

540

return 1

Older »