~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: mbp at sourcefrog
Date: 2005-04-09 05:06:36 UTC
Revision ID: mbp@sourcefrog.net-20050409050636-fc5d6b1af98acb3f61d18276

Revfile: compress data going into datafile if that would be worthwhile

files added:
bzrlib/tests.py

doc/faq.txt

doc/quickref.txt

doc/roadmap.txt

doc/testing.txt

doc/work-order.txt

test.sh

files removed:
HACKING

TODO

bzr-man.py

bzrlib/atomicfile.py

bzrlib/changeset.py

bzrlib/commit.py

bzrlib/delta.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/intset.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/meta_store.py

bzrlib/missing.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/selftest

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/status.py

bzrlib/textinv.py

bzrlib/upgrade.py

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/workingtree.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

doc/revfile-annotation.txt

doc/revfile.txt

doc/split-join-files.txt

doc/switch-in-branch.txt

effbot

effbot/__init__.py

effbot/org

effbot/org/__init__.py

effbot/org/gzip_consumer.py

effbot/org/http_client.py

effbot/org/http_manager.py

notes/revfile.txt

patches

patches/annotate3.patch

patches/annotate4.patch

patches/cache-remote-revisions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/ndiff.patch

patches/pending-merge.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

plugins

plugins/changeset

plugins/changeset/__init__.py

plugins/changeset/apply_changeset.py

plugins/changeset/common.py

plugins/changeset/gen_changeset.py

plugins/changeset/read_changeset.py

plugins/checkperms

testbzr

testsweet.py

tools

tools/convertfile.py

tools/convertinv.py

tools/testweave.py

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files modified:
.bzrignore

.rsyncexclude

NEWS

README

build-api

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/mdiff.py

bzrlib/newinventory.py

bzrlib/osutils.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/store.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/xml.py

doc/Makefile

doc/bitkeeper.txt

doc/formats.txt

doc/index.txt

doc/interrupted.txt

doc/merge.txt

doc/python.txt

doc/random.txt

doc/tagging.txt

doc/todo-from-arch.txt

elementtree/ElementTree.py

urlgrabber/keepalive.py

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

read() call to get everything into memory.

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

# TODO: Could also try to mmap things...

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

factor = 10

_RECORDSIZE = 48

_HEADER = "bzr revfile v1\n"

109

110

FL_GZIP = 1

111

112

# maximum number of patches in a row before recording a whole text.

113

CHAIN_LIMIT = 25

114

115

116

class RevfileError(Exception):

117

pass

118

119

class LimitHitException(Exception):

120

pass

121

122

class Revfile(object):

123

def __init__(self, basename, mode):

class Revfile:

100

def __init__(self, basename):

101

# TODO: Option to open readonly

102

124

103

# TODO: Lock file while open

125

104

126

105

# TODO: advise of random access

127

106

128

107

self.basename = basename

129

130

if mode not in ['r', 'w']:

131

raise RevfileError("invalid open mode %r" % mode)

132

self.mode = mode

133

108

134

109

idxname = basename + '.irev'

135

110

dataname = basename + '.drev'

136

111

112

self.idxpos = 0L

113

137

114

idx_exists = os.path.exists(idxname)

138

115

data_exists = os.path.exists(dataname)

139

116

141

118

raise RevfileError("half-assed revfile")

142

119

143

120

if not idx_exists:

144

if mode == 'r':

145

raise RevfileError("Revfile %r does not exist" % basename)

146

147

121

self.idxfile = open(idxname, 'w+b')

148

122

self.datafile = open(dataname, 'w+b')

149

123

151

125

self.idxfile.write(_HEADER)

152

126

self.idxfile.flush()

153

127

else:

154

if mode == 'r':

155

diskmode = 'rb'

156

else:

157

diskmode = 'r+b'

158

159

self.idxfile = open(idxname, diskmode)

160

self.datafile = open(dataname, diskmode)

128

self.idxfile = open(idxname, 'r+b')

129

self.datafile = open(dataname, 'r+b')

161

130

162

131

h = self.idxfile.read(_RECORDSIZE)

163

132

if h != _HEADER:

165

134

% (h, self.basename))

166

135

167

136

137

138

def revision(self, rev):

139

base = self.index[rev][0]

140

start = self.index[base][1]

141

end = self.index[rev][1] + self.index[rev][2]

142

f = open(self.datafile())

143

144

f.seek(start)

145

data = f.read(end - start)

146

147

last = self.index[base][2]

148

text = zlib.decompress(data[:last])

149

150

for r in range(base + 1, rev + 1):

151

s = self.index[r][2]

152

b = zlib.decompress(data[last:last + s])

153

text = mdiff.bpatch(text, b)

154

last = last + s

155

156

return text

157

158

168

159

def _check_index(self, idx):

169

160

if idx < 0 or idx > len(self):

170

161

raise RevfileError("invalid index %r" % idx)

171

162

172

def _check_write(self):

173

if self.mode != 'w':

174

raise RevfileError("%r is open readonly" % self.basename)

175

176

163

177

164

def find_sha(self, s):

178

165

assert isinstance(s, str)

182

169

if idxrec[I_SHA] == s:

183

170

return idx

184

171

else:

185

return _NO_RECORD

186

187

188

189

def _add_compressed(self, text_sha, data, base, compress):

190

# well, maybe compress

191

flags = 0

192

if compress:

193

data_len = len(data)

194

if data_len > 50:

195

# don't do compression if it's too small; it's unlikely to win

196

# enough to be worthwhile

197

compr_data = zlib.compress(data)

198

compr_len = len(compr_data)

199

if compr_len < data_len:

200

data = compr_data

201

flags = FL_GZIP

202

##print '- compressed %d -> %d, %.1f%%' \

203

## % (data_len, compr_len, float(compr_len)/float(data_len) * 100.0)

204

return self._add_raw(text_sha, data, base, flags)

205

206

207

208

def _add_raw(self, text_sha, data, base, flags):

172

return _NO_RECORD

173

174

175

def _add_common(self, text_sha, data, base):

209

176

"""Add pre-processed data, can be either full text or delta.

210

177

211

178

This does the compression if that makes sense."""

179

180

flags = 0

181

if len(data) > 50:

182

# don't do compression if it's too small; it's unlikely to win

183

# enough to be worthwhile

184

compr_data = zlib.compress(data)

185

if len(compr_data) < len(data):

186

data = compr_data

187

flags = FL_GZIP

188

212

189

idx = len(self)

213

190

self.datafile.seek(0, 2) # to end

214

191

self.idxfile.seek(0, 2)

215

192

assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)

216

193

data_offset = self.datafile.tell()

217

194

218

assert isinstance(data, str) # not unicode or anything weird

195

assert isinstance(data, str) # not unicode or anything wierd

219

196

220

197

self.datafile.write(data)

221

198

self.datafile.flush()

232

209

233

210

234

211

235

def _add_full_text(self, text, text_sha, compress):

212

def _add_full_text(self, text, text_sha):

236

213

"""Add a full text to the file.

237

214

238

215

This is not compressed against any reference version.

239

216

240

217

Returns the index for that text."""

241

return self._add_compressed(text_sha, text, _NO_RECORD, compress)

242

243

244

# NOT USED

245

def _choose_base(self, seed, base):

246

while seed & 3 == 3:

247

if base == _NO_RECORD:

248

return _NO_RECORD

249

idxrec = self[base]

250

if idxrec[I_BASE] == _NO_RECORD:

251

return base

252

253

base = idxrec[I_BASE]

254

seed >>= 2

255

256

return base # relative to this full text

257

258

259

260

def _add_delta(self, text, text_sha, base, compress):

218

return self._add_common(text_sha, text, _NO_RECORD)

219

220

221

def _add_delta(self, text, text_sha, base):

261

222

"""Add a text stored relative to a previous text."""

262

223

self._check_index(base)

263

264

try:

265

base_text = self.get(base, CHAIN_LIMIT)

266

except LimitHitException:

267

return self._add_full_text(text, text_sha, compress)

268

224

base_text = self.get(base)

269

225

data = mdiff.bdiff(base_text, text)

270

271

# If the delta is larger than the text, we might as well just

272

# store the text. (OK, the delta might be more compressible,

273

# but the overhead of applying it probably still makes it

274

# bad, and I don't want to compress both of them to find out.)

275

if len(data) >= len(text):

276

return self._add_full_text(text, text_sha, compress)

277

else:

278

return self._add_compressed(text_sha, data, base, compress)

279

280

281

def add(self, text, base=_NO_RECORD, compress=True):

282

"""Add a new text to the revfile.

283

284

If the text is already present them its existing id is

285

returned and the file is not changed.

286

287

If compress is true then gzip compression will be used if it

288

reduces the size.

289

290

If a base index is specified, that text *may* be used for

291

delta compression of the new text. Delta compression will

292

only be used if it would be a size win and if the existing

293

base is not at too long of a delta chain already.

294

"""

295

self._check_write()

296

226

return self._add_common(text_sha, data, base)

227

228

229

def add(self, text, base=_NO_RECORD):

297

230

text_sha = sha.new(text).digest()

298

231

299

232

idx = self.find_sha(text_sha)

300

233

if idx != _NO_RECORD:

301

# TODO: Optional paranoid mode where we read out that record and make sure

302

# it's the same, in case someone ever breaks SHA-1.

303

234

return idx # already present

304

235

305

# base = self._choose_base(ord(text_sha[0]), base)

306

307

236

if base == _NO_RECORD:

308

return self._add_full_text(text, text_sha, compress)

309

else:

310

return self._add_delta(text, text_sha, base, compress)

311

312

313

314

def get(self, idx, recursion_limit=None):

315

"""Retrieve text of a previous revision.

316

317

If recursion_limit is an integer then walk back at most that

318

many revisions and then raise LimitHitException, indicating

319

that we ought to record a new file text instead of another

320

delta. Don't use this when trying to get out an existing

321

revision."""

322

237

return self._add_full_text(text, text_sha)

238

else:

239

return self._add_delta(text, text_sha, base)

240

241

242

def addrevision(self, text, changeset):

243

t = self.tip()

244

n = t + 1

245

246

if not n % factor:

247

data = zlib.compress(text)

248

base = n

249

else:

250

prev = self.revision(t)

251

data = zlib.compress(mdiff.bdiff(prev, text))

252

base = self.index[t][0]

253

254

offset = 0

255

if t >= 0:

256

offset = self.index[t][1] + self.index[t][2]

257

258

self.index.append((base, offset, len(data), changeset))

259

entry = struct.pack(">llll", base, offset, len(data), changeset)

260

261

open(self.indexfile(), "a").write(entry)

262

open(self.datafile(), "a").write(data)

263

264

265

266

def get(self, idx):

323

267

idxrec = self[idx]

324

268

base = idxrec[I_BASE]

325

269

if base == _NO_RECORD:

326

270

text = self._get_full_text(idx, idxrec)

327

271

else:

328

text = self._get_patched(idx, idxrec, recursion_limit)

272

text = self._get_patched(idx, idxrec)

329

273

330

274

if sha.new(text).digest() != idxrec[I_SHA]:

331

275

raise RevfileError("corrupt SHA-1 digest on record %d"

336

280

337

281

338

282

def _get_raw(self, idx, idxrec):

339

flags = idxrec[I_FLAGS]

340

if flags & ~FL_GZIP:

341

raise RevfileError("unsupported index flags %#x on index %d"

342

% (flags, idx))

343

344

283

l = idxrec[I_LEN]

345

284

if l == 0:

346

285

return ''

353

292

"getting text for record %d in %r"

354

293

% (len(data), l, idx, self.basename))

355

294

356

if flags & FL_GZIP:

357

data = zlib.decompress(data)

358

359

295

return data

360

296

361

297

362

298

def _get_full_text(self, idx, idxrec):

299

assert idxrec[I_FLAGS] == 0

363

300

assert idxrec[I_BASE] == _NO_RECORD

364

301

365

302

text = self._get_raw(idx, idxrec)

367

304

return text

368

305

369

306

370

def _get_patched(self, idx, idxrec, recursion_limit):

307

def _get_patched(self, idx, idxrec):

308

assert idxrec[I_FLAGS] == 0

371

309

base = idxrec[I_BASE]

372

310

assert base >= 0

373

311

assert base < idx # no loops!

374

312

375

if recursion_limit == None:

376

sub_limit = None

377

else:

378

sub_limit = recursion_limit - 1

379

if sub_limit < 0:

380

raise LimitHitException()

381

382

base_text = self.get(base, sub_limit)

313

base_text = self.get(base)

383

314

patch = self._get_raw(idx, idxrec)

384

315

385

316

text = mdiff.bpatch(base_text, patch)

402

333

"""Index by sequence id returns the index field"""

403

334

## TODO: Can avoid seek if we just moved there...

404

335

self._seek_index(idx)

405

idxrec = self._read_next_index()

406

if idxrec == None:

407

raise IndexError("no index %d" % idx)

408

else:

409

return idxrec

336

return self._read_next_index()

410

337

411

338

412

339

def _seek_index(self, idx):

413

340

if idx < 0:

414

341

raise RevfileError("invalid index %r" % idx)

415

342

self.idxfile.seek((idx + 1) * _RECORDSIZE)

416

417

418

419

def __iter__(self):

420

"""Read back all index records.

421

422

Do not seek the index file while this is underway!"""

423

## sys.stderr.write(" ** iter called ** \n")

424

self._seek_index(0)

425

while True:

426

idxrec = self._read_next_index()

427

if not idxrec:

428

break

429

yield idxrec

430

343

431

344

432

345

def _read_next_index(self):

433

346

rec = self.idxfile.read(_RECORDSIZE)

434

347

if not rec:

435

return None

348

raise IndexError("end of index file")

436

349

elif len(rec) != _RECORDSIZE:

437

350

raise RevfileError("short read of %d bytes getting index %d from %r"

438

351

% (len(rec), idx, self.basename))

454

367

f.write("#%-7d " % rec[1])

455

368

456

369

f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))

457

458

459

def total_text_size(self):

460

"""Return the sum of sizes of all file texts.

461

462

This is how much space they would occupy if they were stored without

463

delta and gzip compression.

464

465

As a side effect this completely validates the Revfile, checking that all

466

texts can be reproduced with the correct SHA-1."""

467

t = 0L

468

for idx in range(len(self)):

469

t += len(self.get(idx))

470

return t

471

370

472

371

473

372

474

373

def main(argv):

374

r = Revfile("testrev")

375

475

376

try:

476

377

cmd = argv[1]

477

filename = argv[2]

478

378

except IndexError:

479

sys.stderr.write("usage: revfile dump REVFILE\n"

480

" revfile add REVFILE < INPUT\n"

481

" revfile add-delta REVFILE BASE < INPUT\n"

482

" revfile add-series REVFILE BASE FILE...\n"

483

" revfile get REVFILE IDX\n"

484

" revfile find-sha REVFILE HEX\n"

485

" revfile total-text-size REVFILE\n"

486

" revfile last REVFILE\n")

379

sys.stderr.write("usage: revfile dump\n"

380

" revfile add\n"

381

" revfile add-delta BASE\n"

382

" revfile get IDX\n"

383

" revfile find-sha HEX\n")

487

384

return 1

488

489

def rw():

490

return Revfile(filename, 'w')

491

492

def ro():

493

return Revfile(filename, 'r')

385

494

386

495

387

if cmd == 'add':

496

print rw().add(sys.stdin.read())

388

new_idx = r.add(sys.stdin.read())

389

print 'added idx %d' % new_idx

497

390

elif cmd == 'add-delta':

498

print rw().add(sys.stdin.read(), int(argv[3]))

499

elif cmd == 'add-series':

500

r = rw()

501

rev = int(argv[3])

502

for fn in argv[4:]:

503

print rev

504

rev = r.add(file(fn).read(), rev)

391

new_idx = r.add(sys.stdin.read(), int(argv[2]))

392

print 'added idx %d' % new_idx

505

393

elif cmd == 'dump':

506

ro().dump()

394

r.dump()

507

395

elif cmd == 'get':

508

396

try:

509

idx = int(argv[3])

397

idx = int(argv[2])

510

398

except IndexError:

511

sys.stderr.write("usage: revfile get FILE IDX\n")

399

sys.stderr.write("usage: revfile get IDX\n")

512

400

return 1

513

401

514

r = ro()

515

516

402

if idx < 0 or idx >= len(r):

517

403

sys.stderr.write("invalid index %r\n" % idx)

518

404

return 1

520

406

sys.stdout.write(r.get(idx))

521

407

elif cmd == 'find-sha':

522

408

try:

523

s = unhexlify(argv[3])

409

s = unhexlify(argv[2])

524

410

except IndexError:

525

sys.stderr.write("usage: revfile find-sha FILE HEX\n")

411

sys.stderr.write("usage: revfile find-sha HEX\n")

526

412

return 1

527

413

528

idx = ro().find_sha(s)

414

idx = r.find_sha(s)

529

415

if idx == _NO_RECORD:

530

416

sys.stderr.write("no such record\n")

531

417

return 1

532

418

else:

533

419

print idx

534

elif cmd == 'total-text-size':

535

print ro().total_text_size()

536

elif cmd == 'last':

537

print len(ro())-1

420

538

421

else:

539

422

sys.stderr.write("unknown command %r\n" % cmd)

540

423

return 1

Older »