~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: Martin Pool
Date: 2005-08-12 15:41:44 UTC
Revision ID: mbp@sourcefrog.net-20050812154144-bc98570a78b8f633

- merge in deferred revfile work

files added:
.rsyncexclude

HACKING

TODO

bzr-man.py

bzrlib/atomicfile.py

bzrlib/changeset.py

bzrlib/commit.py

bzrlib/delta.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/intset.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/mdiff.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/meta_store.py

bzrlib/missing.py

bzrlib/newinventory.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/selftest

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/status.py

bzrlib/textinv.py

bzrlib/upgrade.py

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/workingtree.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

doc/quotes.txt

doc/revfile-annotation.txt

doc/revfile.txt

doc/split-join-files.txt

doc/switch-in-branch.txt

effbot

effbot/__init__.py

effbot/org

effbot/org/__init__.py

effbot/org/gzip_consumer.py

effbot/org/http_client.py

effbot/org/http_manager.py

notes/new-inventory-sample.xml

notes/revfile.txt

patches

patches/annotate3.patch

patches/annotate4.patch

patches/cache-remote-revisions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/ndiff.patch

patches/pending-merge.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

plugins

plugins/changeset

plugins/changeset/__init__.py

plugins/changeset/apply_changeset.py

plugins/changeset/common.py

plugins/changeset/gen_changeset.py

plugins/changeset/read_changeset.py

plugins/checkperms

testbzr

testsweet.py

tools

tools/convertfile.py

tools/convertinv.py

tools/testweave.py

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

urlgrabber

urlgrabber/__init__.py

urlgrabber/byterange.py

urlgrabber/grabber.py

urlgrabber/keepalive.py

urlgrabber/mirror.py

urlgrabber/progress.py

files removed:
bzrlib/tests.py

doc/faq.txt

doc/quickref.txt

doc/roadmap.txt

doc/testing.txt

doc/work-order.txt

files modified:
.bzrignore

NEWS

README

build-api

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/osutils.py

bzrlib/revision.py

bzrlib/store.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/xml.py

doc/Makefile

doc/bitkeeper.txt

doc/formats.txt

doc/index.txt

doc/interrupted.txt

doc/merge.txt

doc/python.txt

doc/random.txt

doc/svk.txt

doc/tagging.txt

doc/todo-from-arch.txt

elementtree/ElementTree.py

notes/performance.txt

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

#! /usr/bin/env python

# based on an idea by Matt Mackall

# modified to squish into bzr by Martin Pool

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Packed file revision storage.

A Revfile holds the text history of a particular source file, such

as Makefile. It can represent a tree of text versions for that

file, allowing for microbranches within a single repository.

This is stored on disk as two files: an index file, and a data file.

The index file is short and always read completely into memory; the

data file is much longer and only the relevant bits of it,

identified by the index file, need to be read.

Each text version is identified by the SHA-1 of the full text of

that version. It also has a sequence number within the file.

The index file has a short header and then a sequence of fixed-length

records:

* byte[20] SHA-1 of text (as binary, not hex)

* uint32 sequence number this is based on, or -1 for full text

* uint32 flags: 1=zlib compressed

* uint32 offset in text file of start

* uint32 length of compressed delta in text file

* uint32[3] reserved

total 48 bytes.

The header is also 48 bytes for tidyness and easy calculation.

Both the index and the text are only ever appended to; a consequence

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

read() call to get everything into memory.

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

_RECORDSIZE = 48

_HEADER = "bzr revfile v1\n"

100

_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))

101

_NO_RECORD = 0xFFFFFFFFL

102

103

# fields in the index record

104

I_SHA = 0

105

I_BASE = 1

106

I_FLAGS = 2

107

I_OFFSET = 3

108

I_LEN = 4

109

110

FL_GZIP = 1

111

112

# maximum number of patches in a row before recording a whole text.

113

CHAIN_LIMIT = 25

114

115

116

class RevfileError(Exception):

117

pass

118

119

class LimitHitException(Exception):

120

pass

121

122

class Revfile(object):

123

def __init__(self, basename, mode):

124

# TODO: Lock file while open

125

126

# TODO: advise of random access

127

128

self.basename = basename

129

130

if mode not in ['r', 'w']:

131

raise RevfileError("invalid open mode %r" % mode)

132

self.mode = mode

133

134

idxname = basename + '.irev'

135

dataname = basename + '.drev'

136

137

idx_exists = os.path.exists(idxname)

138

data_exists = os.path.exists(dataname)

139

140

if idx_exists != data_exists:

141

raise RevfileError("half-assed revfile")

142

143

if not idx_exists:

144

if mode == 'r':

145

raise RevfileError("Revfile %r does not exist" % basename)

146

147

self.idxfile = open(idxname, 'w+b')

148

self.datafile = open(dataname, 'w+b')

149

150

print 'init empty file'

151

self.idxfile.write(_HEADER)

152

self.idxfile.flush()

153

else:

154

if mode == 'r':

155

diskmode = 'rb'

156

else:

157

diskmode = 'r+b'

158

159

self.idxfile = open(idxname, diskmode)

160

self.datafile = open(dataname, diskmode)

161

162

h = self.idxfile.read(_RECORDSIZE)

163

if h != _HEADER:

164

raise RevfileError("bad header %r in index of %r"

165

% (h, self.basename))

166

167

168

def _check_index(self, idx):

169

if idx < 0 or idx > len(self):

170

raise RevfileError("invalid index %r" % idx)

171

172

def _check_write(self):

173

if self.mode != 'w':

174

raise RevfileError("%r is open readonly" % self.basename)

175

176

177

def find_sha(self, s):

178

assert isinstance(s, str)

179

assert len(s) == 20

180

181

for idx, idxrec in enumerate(self):

182

if idxrec[I_SHA] == s:

183

return idx

184

else:

185

return _NO_RECORD

186

187

188

189

def _add_compressed(self, text_sha, data, base, compress):

190

# well, maybe compress

191

flags = 0

192

if compress:

193

data_len = len(data)

194

if data_len > 50:

195

# don't do compression if it's too small; it's unlikely to win

196

# enough to be worthwhile

197

compr_data = zlib.compress(data)

198

compr_len = len(compr_data)

199

if compr_len < data_len:

200

data = compr_data

201

flags = FL_GZIP

202

##print '- compressed %d -> %d, %.1f%%' \

203

## % (data_len, compr_len, float(compr_len)/float(data_len) * 100.0)

204

return self._add_raw(text_sha, data, base, flags)

205

206

207

208

def _add_raw(self, text_sha, data, base, flags):

209

"""Add pre-processed data, can be either full text or delta.

210

211

This does the compression if that makes sense."""

212

idx = len(self)

213

self.datafile.seek(0, 2) # to end

214

self.idxfile.seek(0, 2)

215

assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)

216

data_offset = self.datafile.tell()

217

218

assert isinstance(data, str) # not unicode or anything weird

219

220

self.datafile.write(data)

221

self.datafile.flush()

222

223

assert isinstance(text_sha, str)

224

entry = text_sha

225

entry += struct.pack(">IIII12x", base, flags, data_offset, len(data))

226

assert len(entry) == _RECORDSIZE

227

228

self.idxfile.write(entry)

229

self.idxfile.flush()

230

231

return idx

232

233

234

235

def _add_full_text(self, text, text_sha, compress):

236

"""Add a full text to the file.

237

238

This is not compressed against any reference version.

239

240

Returns the index for that text."""

241

return self._add_compressed(text_sha, text, _NO_RECORD, compress)

242

243

244

# NOT USED

245

def _choose_base(self, seed, base):

246

while seed & 3 == 3:

247

if base == _NO_RECORD:

248

return _NO_RECORD

249

idxrec = self[base]

250

if idxrec[I_BASE] == _NO_RECORD:

251

return base

252

253

base = idxrec[I_BASE]

254

seed >>= 2

255

256

return base # relative to this full text

257

258

259

260

def _add_delta(self, text, text_sha, base, compress):

261

"""Add a text stored relative to a previous text."""

262

self._check_index(base)

263

264

try:

265

base_text = self.get(base, CHAIN_LIMIT)

266

except LimitHitException:

267

return self._add_full_text(text, text_sha, compress)

268

269

data = mdiff.bdiff(base_text, text)

270

271

# If the delta is larger than the text, we might as well just

272

# store the text. (OK, the delta might be more compressible,

273

# but the overhead of applying it probably still makes it

274

# bad, and I don't want to compress both of them to find out.)

275

if len(data) >= len(text):

276

return self._add_full_text(text, text_sha, compress)

277

else:

278

return self._add_compressed(text_sha, data, base, compress)

279

280

281

def add(self, text, base=_NO_RECORD, compress=True):

282

"""Add a new text to the revfile.

283

284

If the text is already present them its existing id is

285

returned and the file is not changed.

286

287

If compress is true then gzip compression will be used if it

288

reduces the size.

289

290

If a base index is specified, that text *may* be used for

291

delta compression of the new text. Delta compression will

292

only be used if it would be a size win and if the existing

293

base is not at too long of a delta chain already.

294

"""

295

self._check_write()

296

297

text_sha = sha.new(text).digest()

298

299

idx = self.find_sha(text_sha)

300

if idx != _NO_RECORD:

301

# TODO: Optional paranoid mode where we read out that record and make sure

302

# it's the same, in case someone ever breaks SHA-1.

303

return idx # already present

304

305

# base = self._choose_base(ord(text_sha[0]), base)

306

307

if base == _NO_RECORD:

308

return self._add_full_text(text, text_sha, compress)

309

else:

310

return self._add_delta(text, text_sha, base, compress)

311

312

313

314

def get(self, idx, recursion_limit=None):

315

"""Retrieve text of a previous revision.

316

317

If recursion_limit is an integer then walk back at most that

318

many revisions and then raise LimitHitException, indicating

319

that we ought to record a new file text instead of another

320

delta. Don't use this when trying to get out an existing

321

revision."""

322

323

idxrec = self[idx]

324

base = idxrec[I_BASE]

325

if base == _NO_RECORD:

326

text = self._get_full_text(idx, idxrec)

327

else:

328

text = self._get_patched(idx, idxrec, recursion_limit)

329

330

if sha.new(text).digest() != idxrec[I_SHA]:

331

raise RevfileError("corrupt SHA-1 digest on record %d"

332

% idx)

333

334

return text

335

336

337

338

def _get_raw(self, idx, idxrec):

339

flags = idxrec[I_FLAGS]

340

if flags & ~FL_GZIP:

341

raise RevfileError("unsupported index flags %#x on index %d"

342

% (flags, idx))

343

344

l = idxrec[I_LEN]

345

if l == 0:

346

return ''

347

348

self.datafile.seek(idxrec[I_OFFSET])

349

350

data = self.datafile.read(l)

351

if len(data) != l:

352

raise RevfileError("short read %d of %d "

353

"getting text for record %d in %r"

354

% (len(data), l, idx, self.basename))

355

356

if flags & FL_GZIP:

357

data = zlib.decompress(data)

358

359

return data

360

361

362

def _get_full_text(self, idx, idxrec):

363

assert idxrec[I_BASE] == _NO_RECORD

364

365

text = self._get_raw(idx, idxrec)

366

367

return text

368

369

370

def _get_patched(self, idx, idxrec, recursion_limit):

371

base = idxrec[I_BASE]

372

assert base >= 0

373

assert base < idx # no loops!

374

375

if recursion_limit == None:

376

sub_limit = None

377

else:

378

sub_limit = recursion_limit - 1

379

if sub_limit < 0:

380

raise LimitHitException()

381

382

base_text = self.get(base, sub_limit)

383

patch = self._get_raw(idx, idxrec)

384

385

text = mdiff.bpatch(base_text, patch)

386

387

return text

388

389

390

391

def __len__(self):

392

"""Return number of revisions."""

393

l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]

394

if l % _RECORDSIZE:

395

raise RevfileError("bad length %d on index of %r" % (l, self.basename))

396

if l < _RECORDSIZE:

397

raise RevfileError("no header present in index of %r" % (self.basename))

398

return int(l / _RECORDSIZE) - 1

399

400

401

def __getitem__(self, idx):

402

"""Index by sequence id returns the index field"""

403

## TODO: Can avoid seek if we just moved there...

404

self._seek_index(idx)

405

idxrec = self._read_next_index()

406

if idxrec == None:

407

raise IndexError("no index %d" % idx)

408

else:

409

return idxrec

410

411

412

def _seek_index(self, idx):

413

if idx < 0:

414

raise RevfileError("invalid index %r" % idx)

415

self.idxfile.seek((idx + 1) * _RECORDSIZE)

416

417

418

419

def __iter__(self):

420

"""Read back all index records.

421

422

Do not seek the index file while this is underway!"""

423

## sys.stderr.write(" ** iter called ** \n")

424

self._seek_index(0)

425

while True:

426

idxrec = self._read_next_index()

427

if not idxrec:

428

break

429

yield idxrec

430

431

432

def _read_next_index(self):

433

rec = self.idxfile.read(_RECORDSIZE)

434

if not rec:

435

return None

436

elif len(rec) != _RECORDSIZE:

437

raise RevfileError("short read of %d bytes getting index %d from %r"

438

% (len(rec), idx, self.basename))

439

440

return struct.unpack(">20sIIII12x", rec)

441

442

443

def dump(self, f=sys.stdout):

444

f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n'

445

% tuple('idx sha1 base flags offset len'.split()))

446

f.write('-------- ---------------------------------------- ')

447

f.write('-------- -------- -------- --------\n')

448

449

for i, rec in enumerate(self):

450

f.write("#%-7d %40s " % (i, hexlify(rec[0])))

451

if rec[1] == _NO_RECORD:

452

f.write("(none) ")

453

else:

454

f.write("#%-7d " % rec[1])

455

456

f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))

457

458

459

def total_text_size(self):

460

"""Return the sum of sizes of all file texts.

461

462

This is how much space they would occupy if they were stored without

463

delta and gzip compression.

464

465

As a side effect this completely validates the Revfile, checking that all

466

texts can be reproduced with the correct SHA-1."""

467

t = 0L

468

for idx in range(len(self)):

469

t += len(self.get(idx))

470

return t

471

472

473

474

def main(argv):

475

try:

476

cmd = argv[1]

477

filename = argv[2]

478

except IndexError:

479

sys.stderr.write("usage: revfile dump REVFILE\n"

480

" revfile add REVFILE < INPUT\n"

481

" revfile add-delta REVFILE BASE < INPUT\n"

482

" revfile add-series REVFILE BASE FILE...\n"

483

" revfile get REVFILE IDX\n"

484

" revfile find-sha REVFILE HEX\n"

485

" revfile total-text-size REVFILE\n"

486

" revfile last REVFILE\n")

487

return 1

488

489

def rw():

490

return Revfile(filename, 'w')

491

492

def ro():

493

return Revfile(filename, 'r')

494

495

if cmd == 'add':

496

print rw().add(sys.stdin.read())

497

elif cmd == 'add-delta':

498

print rw().add(sys.stdin.read(), int(argv[3]))

499

elif cmd == 'add-series':

500

r = rw()

501

rev = int(argv[3])

502

for fn in argv[4:]:

503

print rev

504

rev = r.add(file(fn).read(), rev)

505

elif cmd == 'dump':

506

ro().dump()

507

elif cmd == 'get':

508

try:

509

idx = int(argv[3])

510

except IndexError:

511

sys.stderr.write("usage: revfile get FILE IDX\n")

512

return 1

513

514

r = ro()

515

516

if idx < 0 or idx >= len(r):

517

sys.stderr.write("invalid index %r\n" % idx)

518

return 1

519

520

sys.stdout.write(r.get(idx))

521

elif cmd == 'find-sha':

522

try:

523

s = unhexlify(argv[3])

524

except IndexError:

525

sys.stderr.write("usage: revfile find-sha FILE HEX\n")

526

return 1

527

528

idx = ro().find_sha(s)

529

if idx == _NO_RECORD:

530

sys.stderr.write("no such record\n")

531

return 1

532

else:

533

print idx

534

elif cmd == 'total-text-size':

535

print ro().total_text_size()

536

elif cmd == 'last':

537

print len(ro())-1

538

else:

539

sys.stderr.write("unknown command %r\n" % cmd)

540

return 1

541

542

543

if __name__ == '__main__':

544

import sys

545

sys.exit(main(sys.argv) or 0)

Older »