~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: Martin Pool
Date: 2005-08-12 15:50:29 UTC
Revision ID: mbp@sourcefrog.net-20050812155029-7276404ec89cb402

- cleaner interface to Revfile.add(); the base revision can be
just None if unknown

files added:
.rsyncexclude

HACKING

TODO

bzr-man.py

bzrlib/atomicfile.py

bzrlib/changeset.py

bzrlib/commit.py

bzrlib/delta.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/info.py

bzrlib/intset.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/mdiff.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/meta_store.py

bzrlib/missing.py

bzrlib/newinventory.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/selftest

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/status.py

bzrlib/textinv.py

bzrlib/upgrade.py

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/workingtree.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

doc/ignore.txt

doc/quotes.txt

doc/revfile-annotation.txt

doc/revfile.txt

doc/split-join-files.txt

doc/switch-in-branch.txt

effbot

effbot/__init__.py

effbot/org

effbot/org/__init__.py

effbot/org/gzip_consumer.py

effbot/org/http_client.py

effbot/org/http_manager.py

notes/new-inventory-sample.xml

notes/revfile.txt

patches

patches/annotate3.patch

patches/annotate4.patch

patches/cache-remote-revisions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/ndiff.patch

patches/pending-merge.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

plugins

plugins/changeset

plugins/changeset/__init__.py

plugins/changeset/apply_changeset.py

plugins/changeset/common.py

plugins/changeset/gen_changeset.py

plugins/changeset/read_changeset.py

plugins/checkperms

testbzr

testsweet.py

tools

tools/convertfile.py

tools/convertinv.py

tools/testweave.py

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

urlgrabber

urlgrabber/__init__.py

urlgrabber/byterange.py

urlgrabber/grabber.py

urlgrabber/keepalive.py

urlgrabber/mirror.py

urlgrabber/progress.py

files removed:
bzrlib/tests.py

doc/faq.txt

doc/quickref.txt

doc/roadmap.txt

doc/testing.txt

doc/work-order.txt

files modified:
.bzrignore

NEWS

README

build-api

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/inventory.py

bzrlib/osutils.py

bzrlib/revision.py

bzrlib/store.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/xml.py

doc/Makefile

doc/bitkeeper.txt

doc/formats.txt

doc/index.txt

doc/interrupted.txt

doc/merge.txt

doc/python.txt

doc/random.txt

doc/svk.txt

doc/tagging.txt

doc/todo-from-arch.txt

elementtree/ElementTree.py

notes/performance.txt

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

#! /usr/bin/env python

# based on an idea by Matt Mackall

# modified to squish into bzr by Martin Pool

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Packed file revision storage.

A Revfile holds the text history of a particular source file, such

as Makefile. It can represent a tree of text versions for that

file, allowing for microbranches within a single repository.

This is stored on disk as two files: an index file, and a data file.

The index file is short and always read completely into memory; the

data file is much longer and only the relevant bits of it,

identified by the index file, need to be read.

Each text version is identified by the SHA-1 of the full text of

that version. It also has a sequence number within the file.

The index file has a short header and then a sequence of fixed-length

records:

* byte[20] SHA-1 of text (as binary, not hex)

* uint32 sequence number this is based on, or -1 for full text

* uint32 flags: 1=zlib compressed

* uint32 offset in text file of start

* uint32 length of compressed delta in text file

* uint32[3] reserved

total 48 bytes.

The header is also 48 bytes for tidiness and easy calculation.

Both the index and the text are only ever appended to; a consequence

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

read() call to get everything into memory.

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

# TODO: Shouldn't need to lock if we always write in append mode and

# then ftell after writing to see where it went. In any case we

# assume the whole branch is protected by a lock.

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

_RECORDSIZE = 48

100

101

_HEADER = "bzr revfile v1\n"

102

_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))

103

_NO_RECORD = 0xFFFFFFFFL

104

105

# fields in the index record

106

I_SHA = 0

107

I_BASE = 1

108

I_FLAGS = 2

109

I_OFFSET = 3

110

I_LEN = 4

111

112

FL_GZIP = 1

113

114

# maximum number of patches in a row before recording a whole text.

115

CHAIN_LIMIT = 25

116

117

118

class RevfileError(Exception):
    """Raised for invalid use of a Revfile or for inconsistent data on disk."""

120

121

class LimitHitException(Exception):
    """Raised when reconstructing a text would exceed the delta-chain limit."""

123

124

class Revfile(object):
    """Append-only store for the text versions of a single file.

    A revfile lives in two disk files named after ``basename``:
    ``basename + '.irev'`` holds a header followed by fixed-size index
    records, and ``basename + '.drev'`` holds the (possibly
    zlib-compressed) full texts and deltas those records point at.

    Index records are exposed as 5-tuples that are addressed with the
    module-level ``I_SHA``, ``I_BASE``, ``I_FLAGS``, ``I_OFFSET`` and
    ``I_LEN`` constants.  ``len(revfile)`` is the number of records,
    ``revfile[i]`` is the i-th record and iterating yields all records
    in order.
    """

    def __init__(self, basename, mode):
        """Open the revfile called ``basename``.

        ``mode`` is 'r' (read-only; the revfile must exist) or 'w'
        (read/write; an empty revfile is created if none exists).

        Raises RevfileError for an unknown mode, when only one of the
        two disk files exists, when a revfile opened for reading is
        missing, or when the index header is not the expected one.
        """
        # TODO: Lock file while open

        # TODO: advise of random access

        self.basename = basename

        if mode not in ['r', 'w']:
            raise RevfileError("invalid open mode %r" % mode)
        self.mode = mode

        idxname = basename + '.irev'
        dataname = basename + '.drev'

        idx_exists = os.path.exists(idxname)
        data_exists = os.path.exists(dataname)

        if idx_exists != data_exists:
            raise RevfileError("half-assed revfile")

        if not idx_exists:
            if mode == 'r':
                raise RevfileError("Revfile %r does not exist" % basename)

            self.idxfile = open(idxname, 'w+b')
            self.datafile = open(dataname, 'w+b')

            # NOTE(review): diagnostic written to stdout from a library
            # constructor; kept because command-line output includes it.
            print('init empty file')
            self.idxfile.write(_HEADER)
            self.idxfile.flush()
        else:
            if mode == 'r':
                diskmode = 'rb'
            else:
                diskmode = 'r+b'

            self.idxfile = open(idxname, diskmode)
            self.datafile = open(dataname, diskmode)

            h = self.idxfile.read(_RECORDSIZE)
            if h != _HEADER:
                # Do not leak the two handles of a file we refuse to use.
                self.idxfile.close()
                self.datafile.close()
                raise RevfileError("bad header %r in index of %r"
                                   % (h, self.basename))

    def _check_index(self, idx):
        """Raise RevfileError unless ``idx`` names an existing record.

        Valid indexes are 0 .. len(self) - 1.
        """
        if idx < 0 or idx >= len(self):
            raise RevfileError("invalid index %r" % idx)

    def _check_write(self):
        """Raise RevfileError unless the revfile was opened with mode 'w'."""
        if self.mode != 'w':
            raise RevfileError("%r is open readonly" % self.basename)

    def find_sha(self, s):
        """Return the index of the record whose SHA-1 is ``s``.

        ``s`` is the 20-byte binary digest.  Every index record is
        scanned in order; ``_NO_RECORD`` is returned when none matches.
        """
        assert isinstance(s, str)
        assert len(s) == 20

        for idx, idxrec in enumerate(self):
            if idxrec[I_SHA] == s:
                return idx
        return _NO_RECORD

    def _add_compressed(self, text_sha, data, base, compress):
        """Store ``data``, zlib-compressing it first when that helps.

        Compression is only attempted when ``compress`` is true and the
        data is longer than 50 bytes, and is only kept when the result
        is actually shorter.  Returns the index of the new record.
        """
        # well, maybe compress
        flags = 0
        if compress:
            data_len = len(data)
            if data_len > 50:
                # don't do compression if it's too small; it's unlikely to win
                # enough to be worthwhile
                compr_data = zlib.compress(data)
                compr_len = len(compr_data)
                if compr_len < data_len:
                    data = compr_data
                    flags = FL_GZIP

        return self._add_raw(text_sha, data, base, flags)

    def _add_raw(self, text_sha, data, base, flags):
        """Append pre-processed data, which can be a full text or a delta.

        ``data`` is written to the data file exactly as given (any
        compression has already been applied by the caller and is
        described by ``flags``), then an index record pointing at it is
        appended.  Returns the index of the new record.
        """
        idx = len(self)
        self.datafile.seek(0, 2)        # to end
        self.idxfile.seek(0, 2)
        assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
        data_offset = self.datafile.tell()

        assert isinstance(data, str)    # not unicode or anything weird

        # The data is written and flushed before the index record that
        # refers to it.
        self.datafile.write(data)
        self.datafile.flush()

        assert isinstance(text_sha, str)
        entry = text_sha
        entry += struct.pack(">IIII12x", base, flags, data_offset, len(data))
        assert len(entry) == _RECORDSIZE

        self.idxfile.write(entry)
        self.idxfile.flush()

        return idx

    def _add_full_text(self, text, text_sha, compress):
        """Add a full text to the file.

        This is not compressed against any reference version.

        Returns the index for that text."""
        return self._add_compressed(text_sha, text, _NO_RECORD, compress)

    # NOT USED
    def _choose_base(self, seed, base):
        """Walk up the chain of bases, steered by the bits of ``seed``.

        While the two low bits of ``seed`` are both set, ``base`` is
        replaced by its own base and ``seed`` is shifted right by two
        bits.  The walk stops early at ``_NO_RECORD`` or at a record
        that holds a full text.
        """
        while seed & 3 == 3:
            if base == _NO_RECORD:
                return _NO_RECORD
            idxrec = self[base]
            if idxrec[I_BASE] == _NO_RECORD:
                return base

            base = idxrec[I_BASE]
            seed >>= 2

        return base     # relative to this full text

    def _add_delta(self, text, text_sha, base, compress):
        """Add a text stored relative to a previous text.

        Falls back to storing the full text when the base is already at
        the end of a delta chain of CHAIN_LIMIT records, or when the
        delta is not smaller than the text itself.
        """
        self._check_index(base)

        try:
            base_text = self.get(base, CHAIN_LIMIT)
        except LimitHitException:
            return self._add_full_text(text, text_sha, compress)

        data = mdiff.bdiff(base_text, text)

        # If the delta is larger than the text, we might as well just
        # store the text.  (OK, the delta might be more compressible,
        # but the overhead of applying it probably still makes it
        # bad, and I don't want to compress both of them to find out.)
        if len(data) >= len(text):
            return self._add_full_text(text, text_sha, compress)
        else:
            return self._add_compressed(text_sha, data, base, compress)

    def add(self, text, base=None, compress=True):
        """Add a new text to the revfile.

        If the text is already present then its existing id is
        returned and the file is not changed.

        If compress is true then gzip compression will be used if it
        reduces the size.

        If a base index is specified, that text *may* be used for
        delta compression of the new text.  Delta compression will
        only be used if it would be a size win and if the existing
        base is not at too long of a delta chain already.

        Raises RevfileError if the revfile is open read-only or if
        ``base`` is not an existing index.
        """
        if base is None:
            base = _NO_RECORD

        self._check_write()

        text_sha = sha.new(text).digest()

        idx = self.find_sha(text_sha)
        if idx != _NO_RECORD:
            # TODO: Optional paranoid mode where we read out that record and make sure
            # it's the same, in case someone ever breaks SHA-1.
            return idx                  # already present

        # base = self._choose_base(ord(text_sha[0]), base)

        if base == _NO_RECORD:
            return self._add_full_text(text, text_sha, compress)
        else:
            return self._add_delta(text, text_sha, base, compress)

    def get(self, idx, recursion_limit=None):
        """Retrieve text of a previous revision.

        If recursion_limit is an integer then walk back at most that
        many revisions and then raise LimitHitException, indicating
        that we ought to record a new file text instead of another
        delta.  Don't use this when trying to get out an existing
        revision.

        Raises RevfileError if the reconstructed text does not match
        the SHA-1 recorded in the index."""
        idxrec = self[idx]
        base = idxrec[I_BASE]
        if base == _NO_RECORD:
            text = self._get_full_text(idx, idxrec)
        else:
            text = self._get_patched(idx, idxrec, recursion_limit)

        if sha.new(text).digest() != idxrec[I_SHA]:
            raise RevfileError("corrupt SHA-1 digest on record %d"
                               % idx)

        return text

    def _get_raw(self, idx, idxrec):
        """Read and, if flagged, decompress the stored data of a record.

        Raises RevfileError on unknown flag bits or when fewer bytes
        than the index promises can be read from the data file.
        """
        flags = idxrec[I_FLAGS]
        if flags & ~FL_GZIP:
            raise RevfileError("unsupported index flags %#x on index %d"
                               % (flags, idx))

        l = idxrec[I_LEN]
        if l == 0:
            return ''

        self.datafile.seek(idxrec[I_OFFSET])

        data = self.datafile.read(l)
        if len(data) != l:
            raise RevfileError("short read %d of %d "
                               "getting text for record %d in %r"
                               % (len(data), l, idx, self.basename))

        if flags & FL_GZIP:
            data = zlib.decompress(data)

        return data

    def _get_full_text(self, idx, idxrec):
        """Return the text of a record that is stored without a base."""
        assert idxrec[I_BASE] == _NO_RECORD

        text = self._get_raw(idx, idxrec)

        return text

    def _get_patched(self, idx, idxrec, recursion_limit):
        """Rebuild a delta-stored text by patching its base text.

        ``recursion_limit`` is the number of further deltas that may be
        followed (None for no limit); LimitHitException is raised when
        it is used up.  Raises RevfileError if the record does not
        refer to an earlier record as its base.
        """
        base = idxrec[I_BASE]
        # A base must be an earlier record, otherwise following the
        # chain would never terminate.  This is checked explicitly
        # rather than with assert so it still holds under "python -O".
        if base < 0 or base >= idx:
            raise RevfileError("record %d has invalid base %d in %r"
                               % (idx, base, self.basename))

        if recursion_limit is None:
            sub_limit = None
        else:
            sub_limit = recursion_limit - 1
            if sub_limit < 0:
                raise LimitHitException()

        base_text = self.get(base, sub_limit)
        patch = self._get_raw(idx, idxrec)

        text = mdiff.bpatch(base_text, patch)

        return text

    def __len__(self):
        """Return number of revisions.

        This is derived from the current size of the index file, less
        the header.  Raises RevfileError if that size is not a whole
        number of records or is too small to hold the header.
        """
        l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
        if l % _RECORDSIZE:
            raise RevfileError("bad length %d on index of %r" % (l, self.basename))
        if l < _RECORDSIZE:
            raise RevfileError("no header present in index of %r" % (self.basename))
        return l // _RECORDSIZE - 1

    def __getitem__(self, idx):
        """Index by sequence id returns the index field

        Raises IndexError when ``idx`` is past the last record and
        RevfileError when it is negative.
        """
        ## TODO: Can avoid seek if we just moved there...
        self._seek_index(idx)
        idxrec = self._read_next_index()
        if idxrec is None:
            raise IndexError("no index %d" % idx)
        else:
            return idxrec

    def _seek_index(self, idx):
        """Position the index file at the start of record ``idx``."""
        if idx < 0:
            raise RevfileError("invalid index %r" % idx)
        # + 1 skips the header, which occupies the first record slot
        self.idxfile.seek((idx + 1) * _RECORDSIZE)

    def __iter__(self):
        """Read back all index records.

        Do not seek the index file while this is underway!"""
        self._seek_index(0)
        while True:
            idxrec = self._read_next_index()
            if idxrec is None:
                break
            yield idxrec

    def _read_next_index(self):
        """Read the record at the current index-file position.

        Returns the unpacked record tuple, or None at end of file.
        Raises RevfileError if only part of a record could be read.
        """
        rec = self.idxfile.read(_RECORDSIZE)
        if not rec:
            return None
        elif len(rec) != _RECORDSIZE:
            # Work out which record was being read from where the
            # partial read started (slot 0 is the header).
            start = self.idxfile.tell() - len(rec)
            idx = start // _RECORDSIZE - 1
            raise RevfileError("short read of %d bytes getting index %d from %r"
                               % (len(rec), idx, self.basename))

        return struct.unpack(">20sIIII12x", rec)

    def dump(self, f=None):
        """Write a table of all index records to ``f``.

        ``f`` defaults to whatever ``sys.stdout`` is at call time.
        """
        if f is None:
            f = sys.stdout

        f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n'
                % tuple('idx sha1 base flags offset len'.split()))
        f.write('-------- ---------------------------------------- ')
        f.write('-------- -------- -------- --------\n')

        for i, rec in enumerate(self):
            f.write("#%-7d %40s " % (i, hexlify(rec[I_SHA])))
            if rec[I_BASE] == _NO_RECORD:
                f.write("(none) ")
            else:
                f.write("#%-7d " % rec[I_BASE])

            f.write("%8x %8d %8d\n" % (rec[I_FLAGS], rec[I_OFFSET], rec[I_LEN]))

    def total_text_size(self):
        """Return the sum of sizes of all file texts.

        This is how much space they would occupy if they were stored without
        delta and gzip compression.

        As a side effect this completely validates the Revfile, checking that all
        texts can be reproduced with the correct SHA-1."""
        t = 0
        for idx in range(len(self)):
            t += len(self.get(idx))
        return t

476

477

478

479

def main(argv):
    """Command-line entry point for inspecting and updating a revfile.

    ``argv`` is the full argument list including the program name:
    ``argv[1]`` is the command and ``argv[2]`` the revfile basename.

    Returns 1 after writing a message to stderr when the arguments are
    unusable, the command is unknown, or the requested record does not
    exist.  Returns None on success.

    Command arguments are validated before the revfile is opened, so a
    bad invocation never creates or touches any file.
    """
    try:
        cmd = argv[1]
        filename = argv[2]
    except IndexError:
        sys.stderr.write("usage: revfile dump REVFILE\n"
                         " revfile add REVFILE < INPUT\n"
                         " revfile add-delta REVFILE BASE < INPUT\n"
                         " revfile add-series REVFILE BASE FILE...\n"
                         " revfile get REVFILE IDX\n"
                         " revfile find-sha REVFILE HEX\n"
                         " revfile total-text-size REVFILE\n"
                         " revfile last REVFILE\n")
        return 1

    def rw():
        return Revfile(filename, 'w')

    def ro():
        return Revfile(filename, 'r')

    if cmd == 'add':
        print(rw().add(sys.stdin.read()))
    elif cmd == 'add-delta':
        try:
            base = int(argv[3])
        except (IndexError, ValueError):
            sys.stderr.write("usage: revfile add-delta REVFILE BASE < INPUT\n")
            return 1

        print(rw().add(sys.stdin.read(), base))
    elif cmd == 'add-series':
        try:
            rev = int(argv[3])
        except (IndexError, ValueError):
            sys.stderr.write("usage: revfile add-series REVFILE BASE FILE...\n")
            return 1

        r = rw()
        for fn in argv[4:]:
            # Each file is stored as a delta against the one before it;
            # the base used for each file is printed before adding it.
            print(rev)
            infile = open(fn)
            try:
                text = infile.read()
            finally:
                infile.close()
            rev = r.add(text, rev)
    elif cmd == 'dump':
        ro().dump()
    elif cmd == 'get':
        try:
            idx = int(argv[3])
        except (IndexError, ValueError):
            sys.stderr.write("usage: revfile get FILE IDX\n")
            return 1

        r = ro()

        if idx < 0 or idx >= len(r):
            sys.stderr.write("invalid index %r\n" % idx)
            return 1

        sys.stdout.write(r.get(idx))
    elif cmd == 'find-sha':
        try:
            # unhexlify signals malformed hex with TypeError on
            # Python 2 and with a ValueError subclass on Python 3.
            s = unhexlify(argv[3])
        except (IndexError, TypeError, ValueError):
            sys.stderr.write("usage: revfile find-sha FILE HEX\n")
            return 1

        if len(s) != 20:
            # find_sha() only accepts a complete binary SHA-1.
            sys.stderr.write("usage: revfile find-sha FILE HEX\n")
            return 1

        idx = ro().find_sha(s)
        if idx == _NO_RECORD:
            sys.stderr.write("no such record\n")
            return 1
        else:
            print(idx)
    elif cmd == 'total-text-size':
        print(ro().total_text_size())
    elif cmd == 'last':
        print(len(ro()) - 1)
    else:
        sys.stderr.write("unknown command %r\n" % cmd)
        return 1

546

547

548

if __name__ == '__main__':
    # Script entry point: main() returns None on success, which is
    # mapped to exit status 0.
    import sys
    exit_status = main(sys.argv) or 0
    sys.exit(exit_status)

Older »