~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: mbp at sourcefrog
Date: 2005-04-09 05:14:33 UTC
Revision ID: mbp@sourcefrog.net-20050409051433-575f8625dcbb37fb63b09236

Revfile: handle decompression

files added:
bzrlib/tests.py

doc/faq.txt

doc/quickref.txt

doc/roadmap.txt

doc/testing.txt

doc/work-order.txt

test.sh

files removed:
TODO

bzr-man.py

bzrlib/atomicfile.py

bzrlib/changeset.py

bzrlib/commit.py

bzrlib/help.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/merge.py

bzrlib/merge_core.py

bzrlib/meta_store.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/selftest

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/statcache.py

bzrlib/status.py

bzrlib/textinv.py

bzrlib/upgrade.py

bzrlib/workingtree.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/fortune

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

doc/revfile-annotation.txt

doc/revfile.txt

doc/switch-in-branch.txt

effbot

effbot/__init__.py

effbot/org

effbot/org/__init__.py

effbot/org/gzip_consumer.py

effbot/org/http_client.py

effbot/org/http_manager.py

patches

patches/annotate3.patch

patches/annotate4.patch

patches/cache-remote-revisions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

plugins

plugins/changeset

plugins/changeset/__init__.py

plugins/changeset/apply_changeset.py

plugins/changeset/common.py

plugins/changeset/gen_changeset.py

plugins/changeset/read_changeset.py

plugins/checkperms

plugins/rsync

plugins/rsync/__init__.py

plugins/rsync/rsync_update.py

testbzr

files modified:
.bzrignore

.rsyncexclude

NEWS

README

build-api

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/mdiff.py

bzrlib/newinventory.py

bzrlib/osutils.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/store.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

doc/Makefile

doc/bitkeeper.txt

doc/formats.txt

doc/index.txt

doc/interrupted.txt

doc/merge.txt

doc/python.txt

doc/random.txt

doc/tagging.txt

doc/todo-from-arch.txt

elementtree/ElementTree.py

urlgrabber/keepalive.py

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

read() call to get everything into memory.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

# TODO: Could also try to mmap things...

import sys, zlib, struct, mdiff, stat, os, sha

FL_GZIP = 1

# maximum number of patches in a row before recording a whole text.

CHAIN_LIMIT = 50

100

101

102

class RevfileError(Exception):

103

pass

104

105

class LimitHitException(Exception):

106

pass

107

108

class Revfile(object):

109

def __init__(self, basename, mode):

class Revfile:

100

def __init__(self, basename):

101

# TODO: Option to open readonly

102

110

103

# TODO: Lock file while open

111

104

112

105

# TODO: advise of random access

113

106

114

107

self.basename = basename

115

116

if mode not in ['r', 'w']:

117

raise RevfileError("invalid open mode %r" % mode)

118

self.mode = mode

119

108

120

109

idxname = basename + '.irev'

121

110

dataname = basename + '.drev'

122

111

112

self.idxpos = 0L

113

123

114

idx_exists = os.path.exists(idxname)

124

115

data_exists = os.path.exists(dataname)

125

116

127

118

raise RevfileError("half-assed revfile")

128

119

129

120

if not idx_exists:

130

if mode == 'r':

131

raise RevfileError("Revfile %r does not exist" % basename)

132

133

121

self.idxfile = open(idxname, 'w+b')

134

122

self.datafile = open(dataname, 'w+b')

135

123

137

125

self.idxfile.write(_HEADER)

138

126

self.idxfile.flush()

139

127

else:

140

if mode == 'r':

141

diskmode = 'rb'

142

else:

143

diskmode = 'r+b'

144

145

self.idxfile = open(idxname, diskmode)

146

self.datafile = open(dataname, diskmode)

128

self.idxfile = open(idxname, 'r+b')

129

self.datafile = open(dataname, 'r+b')

147

130

148

131

h = self.idxfile.read(_RECORDSIZE)

149

132

if h != _HEADER:

151

134

% (h, self.basename))

152

135

153

136

137

138

def revision(self, rev):

139

base = self.index[rev][0]

140

start = self.index[base][1]

141

end = self.index[rev][1] + self.index[rev][2]

142

f = open(self.datafile())

143

144

f.seek(start)

145

data = f.read(end - start)

146

147

last = self.index[base][2]

148

text = zlib.decompress(data[:last])

149

150

for r in range(base + 1, rev + 1):

151

s = self.index[r][2]

152

b = zlib.decompress(data[last:last + s])

153

text = mdiff.bpatch(text, b)

154

last = last + s

155

156

return text

157

158

154

159

def _check_index(self, idx):

155

160

if idx < 0 or idx > len(self):

156

161

raise RevfileError("invalid index %r" % idx)

157

162

158

def _check_write(self):

159

if self.mode != 'w':

160

raise RevfileError("%r is open readonly" % self.basename)

161

162

163

164

def find_sha(self, s):

164

165

assert isinstance(s, str)

168

169

if idxrec[I_SHA] == s:

169

170

return idx

170

171

else:

171

return _NO_RECORD

172

173

174

175

def _add_compressed(self, text_sha, data, base, compress):

176

# well, maybe compress

177

flags = 0

178

if compress:

179

data_len = len(data)

180

if data_len > 50:

181

# don't do compression if it's too small; it's unlikely to win

182

# enough to be worthwhile

183

compr_data = zlib.compress(data)

184

compr_len = len(compr_data)

185

if compr_len < data_len:

186

data = compr_data

187

flags = FL_GZIP

188

##print '- compressed %d -> %d, %.1f%%' \

189

## % (data_len, compr_len, float(compr_len)/float(data_len) * 100.0)

190

return self._add_raw(text_sha, data, base, flags)

191

192

193

194

def _add_raw(self, text_sha, data, base, flags):

172

return _NO_RECORD

173

174

175

def _add_common(self, text_sha, data, base):

195

176

"""Add pre-processed data, can be either full text or delta.

196

177

197

178

This does the compression if that makes sense."""

179

180

flags = 0

181

data_len = len(data)

182

if data_len > 50:

183

# don't do compression if it's too small; it's unlikely to win

184

# enough to be worthwhile

185

compr_data = zlib.compress(data)

186

compr_len = len(compr_data)

187

if compr_len < data_len:

188

data = compr_data

189

flags = FL_GZIP

190

print '- compressed %d -> %d, %.1f%%' \

191

% (data_len, compr_len, float(compr_len)/float(data_len) * 100.0)

192

198

193

idx = len(self)

199

194

self.datafile.seek(0, 2) # to end

200

195

self.idxfile.seek(0, 2)

201

196

assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)

202

197

data_offset = self.datafile.tell()

203

198

204

assert isinstance(data, str) # not unicode or anything weird

199

assert isinstance(data, str) # not unicode or anything wierd

205

200

206

201

self.datafile.write(data)

207

202

self.datafile.flush()

218

213

219

214

220

215

221

def _add_full_text(self, text, text_sha, compress):

216

def _add_full_text(self, text, text_sha):

222

217

"""Add a full text to the file.

223

218

224

219

This is not compressed against any reference version.

225

220

226

221

Returns the index for that text."""

227

return self._add_compressed(text_sha, text, _NO_RECORD, compress)

228

229

230

def _add_delta(self, text, text_sha, base, compress):

222

return self._add_common(text_sha, text, _NO_RECORD)

223

224

225

def _add_delta(self, text, text_sha, base):

231

226

"""Add a text stored relative to a previous text."""

232

227

self._check_index(base)

233

234

try:

235

base_text = self.get(base, recursion_limit=CHAIN_LIMIT)

236

except LimitHitException:

237

return self._add_full_text(text, text_sha, compress)

238

228

base_text = self.get(base)

239

229

data = mdiff.bdiff(base_text, text)

240

241

# If the delta is larger than the text, we might as well just

242

# store the text. (OK, the delta might be more compressible,

243

# but the overhead of applying it probably still makes it

244

# bad, and I don't want to compress both of them to find out.)

245

if len(data) >= len(text):

246

return self._add_full_text(text, text_sha, compress)

247

else:

248

return self._add_compressed(text_sha, data, base, compress)

249

250

251

def add(self, text, base=_NO_RECORD, compress=True):

252

"""Add a new text to the revfile.

253

254

If the text is already present them its existing id is

255

returned and the file is not changed.

256

257

If compress is true then gzip compression will be used if it

258

reduces the size.

259

260

If a base index is specified, that text *may* be used for

261

delta compression of the new text. Delta compression will

262

only be used if it would be a size win and if the existing

263

base is not at too long of a delta chain already.

264

"""

265

self._check_write()

266

230

return self._add_common(text_sha, data, base)

231

232

233

def add(self, text, base=_NO_RECORD):

267

234

text_sha = sha.new(text).digest()

268

235

269

236

idx = self.find_sha(text_sha)

270

237

if idx != _NO_RECORD:

271

# TODO: Optional paranoid mode where we read out that record and make sure

272

# it's the same, in case someone ever breaks SHA-1.

273

238

return idx # already present

274

239

275

240

if base == _NO_RECORD:

276

return self._add_full_text(text, text_sha, compress)

277

else:

278

return self._add_delta(text, text_sha, base, compress)

279

280

281

282

def get(self, idx, recursion_limit=None):

283

"""Retrieve text of a previous revision.

284

285

If recursion_limit is an integer then walk back at most that

286

many revisions and then raise LimitHitException, indicating

287

that we ought to record a new file text instead of another

288

delta. Don't use this when trying to get out an existing

289

revision."""

290

241

return self._add_full_text(text, text_sha)

242

else:

243

return self._add_delta(text, text_sha, base)

244

245

246

def addrevision(self, text, changeset):

247

t = self.tip()

248

n = t + 1

249

250

if not n % factor:

251

data = zlib.compress(text)

252

base = n

253

else:

254

prev = self.revision(t)

255

data = zlib.compress(mdiff.bdiff(prev, text))

256

base = self.index[t][0]

257

258

offset = 0

259

if t >= 0:

260

offset = self.index[t][1] + self.index[t][2]

261

262

self.index.append((base, offset, len(data), changeset))

263

entry = struct.pack(">llll", base, offset, len(data), changeset)

264

265

open(self.indexfile(), "a").write(entry)

266

open(self.datafile(), "a").write(data)

267

268

269

270

def get(self, idx):

291

271

idxrec = self[idx]

292

272

base = idxrec[I_BASE]

293

273

if base == _NO_RECORD:

294

274

text = self._get_full_text(idx, idxrec)

295

275

else:

296

text = self._get_patched(idx, idxrec, recursion_limit)

276

text = self._get_patched(idx, idxrec)

297

277

298

278

if sha.new(text).digest() != idxrec[I_SHA]:

299

279

raise RevfileError("corrupt SHA-1 digest on record %d"

335

315

return text

336

316

337

317

338

def _get_patched(self, idx, idxrec, recursion_limit):

318

def _get_patched(self, idx, idxrec):

339

319

base = idxrec[I_BASE]

340

320

assert base >= 0

341

321

assert base < idx # no loops!

342

322

343

if recursion_limit == None:

344

sub_limit = None

345

else:

346

sub_limit = recursion_limit - 1

347

if sub_limit < 0:

348

raise LimitHitException()

349

350

base_text = self.get(base, sub_limit)

323

base_text = self.get(base)

351

324

patch = self._get_raw(idx, idxrec)

352

325

353

326

text = mdiff.bpatch(base_text, patch)

370

343

"""Index by sequence id returns the index field"""

371

344

## TODO: Can avoid seek if we just moved there...

372

345

self._seek_index(idx)

373

idxrec = self._read_next_index()

374

if idxrec == None:

375

raise IndexError()

376

else:

377

return idxrec

346

return self._read_next_index()

378

347

379

348

380

349

def _seek_index(self, idx):

381

350

if idx < 0:

382

351

raise RevfileError("invalid index %r" % idx)

383

352

self.idxfile.seek((idx + 1) * _RECORDSIZE)

384

385

386

387

def __iter__(self):

388

"""Read back all index records.

389

390

Do not seek the index file while this is underway!"""

391

sys.stderr.write(" ** iter called ** \n")

392

self._seek_index(0)

393

while True:

394

idxrec = self._read_next_index()

395

if not idxrec:

396

break

397

yield idxrec

398

353

399

354

400

355

def _read_next_index(self):

401

356

rec = self.idxfile.read(_RECORDSIZE)

402

357

if not rec:

403

return None

358

raise IndexError("end of index file")

404

359

elif len(rec) != _RECORDSIZE:

405

360

raise RevfileError("short read of %d bytes getting index %d from %r"

406

361

% (len(rec), idx, self.basename))

422

377

f.write("#%-7d " % rec[1])

423

378

424

379

f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))

425

426

427

def total_text_size(self):

428

"""Return the sum of sizes of all file texts.

429

430

This is how much space they would occupy if they were stored without

431

delta and gzip compression.

432

433

As a side effect this completely validates the Revfile, checking that all

434

texts can be reproduced with the correct SHA-1."""

435

t = 0L

436

for idx in range(len(self)):

437

t += len(self.get(idx))

438

return t

439

380

440

381

441

382

442

383

def main(argv):

384

r = Revfile("testrev")

385

443

386

try:

444

387

cmd = argv[1]

445

388

except IndexError:

447

390

" revfile add\n"

448

391

" revfile add-delta BASE\n"

449

392

" revfile get IDX\n"

450

" revfile find-sha HEX\n"

451

" revfile total-text-size\n"

452

" revfile last\n")

393

" revfile find-sha HEX\n")

453

394

return 1

454

455

def rw():

456

return Revfile('testrev', 'w')

457

458

def ro():

459

return Revfile('testrev', 'r')

395

460

396

461

397

if cmd == 'add':

462

print rw().add(sys.stdin.read())

398

new_idx = r.add(sys.stdin.read())

399

print 'added idx %d' % new_idx

463

400

elif cmd == 'add-delta':

464

print rw().add(sys.stdin.read(), int(argv[2]))

401

new_idx = r.add(sys.stdin.read(), int(argv[2]))

402

print 'added idx %d' % new_idx

465

403

elif cmd == 'dump':

466

ro().dump()

404

r.dump()

467

405

elif cmd == 'get':

468

406

try:

469

407

idx = int(argv[2])

475

413

sys.stderr.write("invalid index %r\n" % idx)

476

414

return 1

477

415

478

sys.stdout.write(ro().get(idx))

416

sys.stdout.write(r.get(idx))

479

417

elif cmd == 'find-sha':

480

418

try:

481

419

s = unhexlify(argv[2])

483

421

sys.stderr.write("usage: revfile find-sha HEX\n")

484

422

return 1

485

423

486

idx = ro().find_sha(s)

424

idx = r.find_sha(s)

487

425

if idx == _NO_RECORD:

488

426

sys.stderr.write("no such record\n")

489

427

return 1

490

428

else:

491

429

print idx

492

elif cmd == 'total-text-size':

493

print ro().total_text_size()

494

elif cmd == 'last':

495

print len(ro())-1

430

496

431

else:

497

432

sys.stderr.write("unknown command %r\n" % cmd)

498

433

return 1

Older »