~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: Martin Pool
Date: 2005-11-18 08:10:07 UTC
mfrom: (1393.4.2)
mto: (1185.74.1 bzr.dev (Main development branch)) (1508.1.15) (1505.1.19 bzr-bound-branch) (1526.1.3 run_tests_twice_for_i18n) (1540.1.3 bzr.dev (Main development branch)) (1685.1.1 bzr-encoding) (1534.1.1 integration) (1553.5.1 bzr.dev (Main development branch)) (1608.2.1 bzr.mbp.escape-stores)
mto: This revision was merged to the branch mainline in revision 1509.
Revision ID: mbp@sourcefrog.net-20051118081007-80523bf145eb319b

[merge] fix \t in commit messages

files added:
INSTALL

NEWS.developers

bzrlib/annotate.py

bzrlib/clone.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/identitymap.py

bzrlib/option.py

bzrlib/revisionspec.py

bzrlib/selftest/HTTPTestUtil.py

bzrlib/selftest/stub_sftp.py

bzrlib/selftest/test_bad_files.py

bzrlib/selftest/test_command.py

bzrlib/selftest/test_commit.py

bzrlib/selftest/test_commit_merge.py

bzrlib/selftest/test_conflicts.py

bzrlib/selftest/test_revision_info.py

bzrlib/selftest/test_upgrade.py

bzrlib/selftest/testannotate.py

bzrlib/selftest/testapi.py

bzrlib/selftest/testconfig.py

bzrlib/selftest/testgpg.py

bzrlib/selftest/testgraph.py

bzrlib/selftest/testhttp.py

bzrlib/selftest/testidentitymap.py

bzrlib/selftest/testmerge.py

bzrlib/selftest/testnonascii.py

bzrlib/selftest/testoptions.py

bzrlib/selftest/testrevprops.py

bzrlib/selftest/testreweave.py

bzrlib/selftest/testsampler.py

bzrlib/selftest/testsftp.py

bzrlib/selftest/testtestament.py

bzrlib/selftest/testtrace.py

bzrlib/selftest/testtransactions.py

bzrlib/selftest/testtransport.py

bzrlib/selftest/testtsort.py

bzrlib/selftest/testworkingtree.py

bzrlib/selftest/treeshape.py

bzrlib/store

bzrlib/store/text.py

bzrlib/testament.py

bzrlib/transactions.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/ftp.py

bzrlib/transport/http.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/sftp.py

bzrlib/tsort.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/util/configobj/validate.py

bzrlib/win32console.py

bzrlib/xml4.py

tools/capture_tree.py

tools/trace-revisions

files removed:
bzrlib/mdiff.py

bzrlib/meta_store.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/upgrade.py

patches/annotate3.patch

patches/annotate4.patch

patches/cache-remote-revisions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/ndiff.patch

patches/pending-merge.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

testbzr

testsweet.py

files renamed:
bzr-man.py => bzr_man.py

tools/testweave.py => bzrlib/selftest/test_weave.py

bzrlib/selftest/plugins.py => bzrlib/selftest/testplugins.py

bzrlib/store.py => bzrlib/store/__init__.py

bzrlib/weavestore.py => bzrlib/store/weave.py

tools/history2weaves.py => bzrlib/upgrade.py

files modified:
.bzrignore

.rsyncexclude

HACKING

Makefile

NEWS

README

TODO

build-api

bzr *

bzrlib/__init__.py

bzrlib/add.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/changeset.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/delta.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/info.py

bzrlib/intset.py

bzrlib/inventory.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/revision.py

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/test_ancestry.py

bzrlib/selftest/test_merge_core.py

bzrlib/selftest/test_parent.py

bzrlib/selftest/test_smart_add.py

bzrlib/selftest/test_xml.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testfetch.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/teststore.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/shellcomplete.py

bzrlib/status.py

bzrlib/textinv.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/weave.py *

bzrlib/weavefile.py

bzrlib/workingtree.py

bzrlib/xml.py

bzrlib/xml5.py

contrib/newinventory.py

contrib/zsh/_bzr

doc/random.txt

setup.py *

tutorial.txt

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

#! /usr/bin/env python

# based on an idea by Matt Mackall

# modified to squish into bzr by Martin Pool

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Packed file revision storage.

A Revfile holds the text history of a particular source file, such

as Makefile. It can represent a tree of text versions for that

file, allowing for microbranches within a single repository.

This is stored on disk as two files: an index file, and a data file.

The index file is short and always read completely into memory; the

data file is much longer and only the relevant bits of it,

identified by the index file, need to be read.

Each text version is identified by the SHA-1 of the full text of

that version. It also has a sequence number within the file.

The index file has a short header and then a sequence of fixed-length

records:

* byte[20] SHA-1 of text (as binary, not hex)

* uint32 sequence number this is based on, or 0xFFFFFFFF (i.e. -1) for full text

* uint32 flags: 1=zlib compressed

* uint32 offset in text file of start

* uint32 length of compressed delta in text file

* uint32[3] reserved

total 48 bytes.

The header is also 48 bytes for tidiness and easy calculation.

Both the index and the text are only ever appended to; a consequence

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

read() call to get everything into memory.

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

# TODO: Shouldn't need to lock if we always write in append mode and

# then ftell after writing to see where it went. In any case we

# assume the whole branch is protected by a lock.

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

# Size in bytes of one index record; the index header is padded to the
# same size so record N starts at (N + 1) * _RECORDSIZE.
_RECORDSIZE = 48

# Index file header: a magic line padded out to a full record with 0xff.
_HEADER = "bzr revfile v1\n"
_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))

# Value stored in the unsigned 32-bit "base" field to mean "no base record"
# (i.e. a full text), and returned by lookups that find nothing.  Written
# without a long-integer suffix; the value compares equal either way.
_NO_RECORD = 0xFFFFFFFF

# fields in the index record (positions in the unpacked tuple)
I_SHA = 0
I_BASE = 1
I_FLAGS = 2
I_OFFSET = 3
I_LEN = 4

# flag bit: the stored data is zlib-compressed
FL_GZIP = 1

# maximum number of patches in a row before recording a whole text.
CHAIN_LIMIT = 10

116

117

118

class RevfileError(Exception):
    """Error raised for misuse of a revfile or damage detected in one."""

120

121

class LimitHitException(Exception):
    """Signal that a delta chain is longer than the caller's limit."""

123

124

class Revfile(object):
    """Append-only store of the text history of a single file.

    A revfile is a pair of files on disk: ``basename + '.irev'`` holds a
    header followed by fixed-size index records, and ``basename + '.drev'``
    holds the (possibly zlib-compressed) full texts and deltas that the
    index records point at.

    Records are addressed by sequence number, the 0-based position of the
    record in the index.  ``len(revfile)`` is the number of records,
    ``revfile[idx]`` returns the unpacked index record, and iterating
    yields every index record in order.
    """

    def __init__(self, basename, mode):
        """Open, or create, the revfile called `basename`.

        mode -- 'r' to open an existing revfile read-only, or 'w' to open
            it for update, creating both files if neither exists yet.

        Raises RevfileError if the mode is invalid, if only one of the two
        files exists, if a read-only open finds no revfile, or if the index
        does not start with the expected header.
        """
        # TODO: Lock file while open

        # TODO: advise of random access

        self.basename = basename

        if mode not in ('r', 'w'):
            raise RevfileError("invalid open mode %r" % mode)
        self.mode = mode

        idxname = basename + '.irev'
        dataname = basename + '.drev'

        idx_exists = os.path.exists(idxname)
        data_exists = os.path.exists(dataname)

        if idx_exists != data_exists:
            raise RevfileError("half-assed revfile")

        if not idx_exists:
            if mode == 'r':
                raise RevfileError("Revfile %r does not exist" % basename)

            self.idxfile = open(idxname, 'w+b')
            self.datafile = open(dataname, 'w+b')

            self.idxfile.write(_HEADER)
            self.idxfile.flush()
        else:
            if mode == 'r':
                diskmode = 'rb'
            else:
                diskmode = 'r+b'

            self.idxfile = open(idxname, diskmode)
            self.datafile = open(dataname, diskmode)

            h = self.idxfile.read(_RECORDSIZE)
            if h != _HEADER:
                # Don't leave the handles open on an object the caller
                # never gets to see.
                self.idxfile.close()
                self.datafile.close()
                raise RevfileError("bad header %r in index of %r"
                                   % (h, self.basename))

    def _check_index(self, idx):
        """Raise RevfileError unless `idx` names an existing record."""
        # Valid sequence numbers run from 0 to len(self) - 1 inclusive.
        if idx < 0 or idx >= len(self):
            raise RevfileError("invalid index %r" % idx)

    def _check_write(self):
        """Raise RevfileError unless the revfile was opened with mode 'w'."""
        if self.mode != 'w':
            raise RevfileError("%r is open readonly" % self.basename)

    def find_sha(self, s):
        """Return the index of the record whose SHA-1 digest is `s`.

        `s` is the 20-byte binary (not hex) digest.  Returns _NO_RECORD if
        no record has that digest.  This scans the index from the start.
        """
        assert isinstance(s, str)
        assert len(s) == 20

        for idx, idxrec in enumerate(self):
            if idxrec[I_SHA] == s:
                return idx
        return _NO_RECORD

    def _add_compressed(self, text_sha, data, base, compress):
        """Store `data`, zlib-compressing it first if that makes it smaller.

        `data` may be a full text or a delta against record `base`.
        Returns the index of the new record.
        """
        # well, maybe compress
        flags = 0
        if compress:
            data_len = len(data)
            if data_len > 50:
                # don't do compression if it's too small; it's unlikely to win
                # enough to be worthwhile
                compr_data = zlib.compress(data)
                compr_len = len(compr_data)
                if compr_len < data_len:
                    data = compr_data
                    flags = FL_GZIP
        return self._add_raw(text_sha, data, base, flags)

    def _add_raw(self, text_sha, data, base, flags):
        """Append pre-processed data, either a full text or a delta.

        `data` is written exactly as given; any compression has already
        been applied by the caller and is described by `flags`.  The data
        is written and flushed before the index record that refers to it.

        Returns the index of the new record.
        """
        idx = len(self)
        self.datafile.seek(0, 2)        # to end
        self.idxfile.seek(0, 2)
        assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
        data_offset = self.datafile.tell()

        assert isinstance(data, str)    # not unicode or anything weird

        self.datafile.write(data)
        self.datafile.flush()

        assert isinstance(text_sha, str)
        entry = text_sha
        entry += struct.pack(">IIII12x", base, flags, data_offset, len(data))
        assert len(entry) == _RECORDSIZE

        self.idxfile.write(entry)
        self.idxfile.flush()

        return idx

    def _add_full_text(self, text, text_sha, compress):
        """Add a full text to the file.

        This is not compressed against any reference version.

        Returns the index for that text.
        """
        return self._add_compressed(text_sha, text, _NO_RECORD, compress)

    # NOT USED
    def _choose_base(self, seed, base):
        """Pick a base record by walking back from `base`, steered by `seed`.

        While the two low bits of `seed` are both set, step from `base` to
        the record it is itself based on, shifting `seed` right by two bits
        each time.  Stops early at a full text.
        """
        while seed & 3 == 3:
            if base == _NO_RECORD:
                return _NO_RECORD
            idxrec = self[base]
            if idxrec[I_BASE] == _NO_RECORD:
                return base

            base = idxrec[I_BASE]
            seed >>= 2

        return base     # relative to this full text

    def _add_delta(self, text, text_sha, base, compress):
        """Add a text stored relative to a previous text.

        Falls back to storing the full text when the delta chain behind
        `base` is already CHAIN_LIMIT long, or when the delta is not
        smaller than the text itself.  Returns the index of the new record.
        """
        self._check_index(base)

        try:
            base_text = self.get(base, CHAIN_LIMIT)
        except LimitHitException:
            return self._add_full_text(text, text_sha, compress)

        data = mdiff.bdiff(base_text, text)

        # paranoid early check for bad diff
        result = mdiff.bpatch(base_text, data)
        assert result == text

        # If the delta is larger than the text, we might as well just
        # store the text.  (OK, the delta might be more compressible,
        # but the overhead of applying it probably still makes it
        # bad, and I don't want to compress both of them to find out.)
        if len(data) >= len(text):
            return self._add_full_text(text, text_sha, compress)
        else:
            return self._add_compressed(text_sha, data, base, compress)

    def add(self, text, base=None, compress=True):
        """Add a new text to the revfile and return its index.

        If the text is already present then its existing id is
        returned and the file is not changed.

        If compress is true then gzip compression will be used if it
        reduces the size.

        If a base index is specified, that text *may* be used for
        delta compression of the new text.  Delta compression will
        only be used if it would be a size win and if the existing
        base is not at too long of a delta chain already.

        Raises RevfileError if the revfile is open read-only.
        """
        if base is None:
            base = _NO_RECORD

        self._check_write()

        text_sha = sha.new(text).digest()

        idx = self.find_sha(text_sha)
        if idx != _NO_RECORD:
            # TODO: Optional paranoid mode where we read out that record and make sure
            # it's the same, in case someone ever breaks SHA-1.
            return idx                  # already present

        # base = self._choose_base(ord(text_sha[0]), base)

        if base == _NO_RECORD:
            return self._add_full_text(text, text_sha, compress)
        else:
            return self._add_delta(text, text_sha, base, compress)

    def get(self, idx, recursion_limit=None):
        """Retrieve text of a previous revision.

        If recursion_limit is an integer then walk back at most that
        many revisions and then raise LimitHitException, indicating
        that we ought to record a new file text instead of another
        delta.  Don't use this when trying to get out an existing
        revision.

        Raises RevfileError if the reconstructed text does not match the
        SHA-1 digest stored in the index record.
        """
        idxrec = self[idx]
        base = idxrec[I_BASE]
        if base == _NO_RECORD:
            text = self._get_full_text(idx, idxrec)
        else:
            text = self._get_patched(idx, idxrec, recursion_limit)

        if sha.new(text).digest() != idxrec[I_SHA]:
            raise RevfileError("corrupt SHA-1 digest on record %d in %s"
                               % (idx, self.basename))

        return text

    def _get_raw(self, idx, idxrec):
        """Return the stored data for a record, decompressed if flagged.

        The result is a full text or a delta, depending on the record.
        Raises RevfileError on unknown flag bits or a short read.
        """
        flags = idxrec[I_FLAGS]
        if flags & ~FL_GZIP:
            raise RevfileError("unsupported index flags %#x on index %d"
                               % (flags, idx))

        l = idxrec[I_LEN]
        if l == 0:
            return ''

        self.datafile.seek(idxrec[I_OFFSET])

        data = self.datafile.read(l)
        if len(data) != l:
            raise RevfileError("short read %d of %d "
                               "getting text for record %d in %r"
                               % (len(data), l, idx, self.basename))

        if flags & FL_GZIP:
            data = zlib.decompress(data)

        return data

    def _get_full_text(self, idx, idxrec):
        """Return the text of a record that is stored as a full text."""
        assert idxrec[I_BASE] == _NO_RECORD

        text = self._get_raw(idx, idxrec)

        return text

    def _get_patched(self, idx, idxrec, recursion_limit):
        """Return the text of a record stored as a delta against its base.

        The base text is fetched through get(), with the recursion limit
        reduced by one; LimitHitException is raised once it drops below
        zero.  A limit of None means no limit.
        """
        base = idxrec[I_BASE]
        assert base >= 0
        assert base < idx       # no loops!

        if recursion_limit is None:
            sub_limit = None
        else:
            sub_limit = recursion_limit - 1
            if sub_limit < 0:
                raise LimitHitException()

        base_text = self.get(base, sub_limit)
        patch = self._get_raw(idx, idxrec)

        text = mdiff.bpatch(base_text, patch)

        return text

    def __len__(self):
        """Return number of revisions.

        This is computed from the current size of the index file.  Raises
        RevfileError if that size is not a whole number of records or is
        too small to contain the header.
        """
        l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
        if l % _RECORDSIZE:
            raise RevfileError("bad length %d on index of %r" % (l, self.basename))
        if l < _RECORDSIZE:
            raise RevfileError("no header present in index of %r" % (self.basename))
        # The first record-sized slot is the header, not a revision.
        return l // _RECORDSIZE - 1

    def __getitem__(self, idx):
        """Index by sequence id returns the index field.

        Raises IndexError if there is no record `idx`.
        """
        ## TODO: Can avoid seek if we just moved there...
        self._seek_index(idx)
        idxrec = self._read_next_index()
        if idxrec is None:
            raise IndexError("no index %d" % idx)
        else:
            return idxrec

    def _seek_index(self, idx):
        """Position the index file at the start of record `idx`."""
        if idx < 0:
            raise RevfileError("invalid index %r" % idx)
        # + 1 skips the header slot at the start of the file.
        self.idxfile.seek((idx + 1) * _RECORDSIZE)

    def __iter__(self):
        """Read back all index records.

        Do not seek the index file while this is underway!
        """
        self._seek_index(0)
        while True:
            idxrec = self._read_next_index()
            if not idxrec:
                break
            yield idxrec

    def _read_next_index(self):
        """Read and unpack the index record at the current file position.

        Returns a tuple (sha, base, flags, offset, length), or None at end
        of file.  Raises RevfileError if only part of a record could be
        read.
        """
        rec = self.idxfile.read(_RECORDSIZE)
        if not rec:
            return None
        elif len(rec) != _RECORDSIZE:
            # Work out which record was truncated from where the read
            # started; the header occupies the first record-sized slot.
            start = self.idxfile.tell() - len(rec)
            idx = start // _RECORDSIZE - 1
            raise RevfileError("short read of %d bytes getting index %d from %r"
                               % (len(rec), idx, self.basename))

        return struct.unpack(">20sIIII12x", rec)

    def dump(self, f=None):
        """Write a table of all index records to `f`.

        `f` is any object with a write() method; if omitted, whatever
        sys.stdout is at call time is used.
        """
        if f is None:
            f = sys.stdout

        f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n'
                % tuple('idx sha1 base flags offset len'.split()))
        f.write('-------- ---------------------------------------- ')
        f.write('-------- -------- -------- --------\n')

        for i, rec in enumerate(self):
            f.write("#%-7d %40s " % (i, hexlify(rec[I_SHA])))
            if rec[I_BASE] == _NO_RECORD:
                f.write("(none) ")
            else:
                f.write("#%-7d " % rec[I_BASE])

            f.write("%8x %8d %8d\n" % (rec[I_FLAGS], rec[I_OFFSET], rec[I_LEN]))

    def total_text_size(self):
        """Return the sum of sizes of all file texts.

        This is how much space they would occupy if they were stored without
        delta and gzip compression.

        As a side effect this completely validates the Revfile, checking that all
        texts can be reproduced with the correct SHA-1.
        """
        t = 0
        for idx in range(len(self)):
            t += len(self.get(idx))
        return t

    def check(self, pb=None):
        """Extract every version and check its hash.

        If a progress bar `pb` is given, its update() method is called
        before each record and clear() once all records have been read.
        """
        total = len(self)
        for i in range(total):
            if pb:
                pb.update("check revision", i, total)
            # the get method implicitly checks the SHA-1
            self.get(i)
        if pb:
            pb.clear()

493

494

495

496

def main(argv):
    """Command-line driver for inspecting and updating a revfile.

    argv -- full argument vector: argv[1] is the command and argv[2] the
        revfile name (a trailing '.irev' or '.drev' is stripped); further
        arguments depend on the command.

    Returns 1 after reporting a usage or lookup error on stderr, and None
    on success.  All command arguments are validated before the revfile
    is opened, so a malformed command line never creates a revfile.
    """
    usage = ("usage: revfile dump REVFILE\n"
             " revfile add REVFILE < INPUT\n"
             " revfile add-delta REVFILE BASE < INPUT\n"
             " revfile add-series REVFILE BASE FILE...\n"
             " revfile get REVFILE IDX\n"
             " revfile find-sha REVFILE HEX\n"
             " revfile total-text-size REVFILE\n"
             " revfile last REVFILE\n"
             " revfile check REVFILE\n")

    try:
        cmd = argv[1]
        filename = argv[2]
    except IndexError:
        sys.stderr.write(usage)
        return 1

    if filename.endswith('.drev') or filename.endswith('.irev'):
        filename = filename[:-5]

    def rw():
        return Revfile(filename, 'w')

    def ro():
        return Revfile(filename, 'r')

    def int_arg(what, usage_line):
        # Parse argv[3] as an integer; report the problem and return None
        # if it is missing or not a number.
        try:
            return int(argv[3])
        except IndexError:
            sys.stderr.write(usage_line)
        except ValueError:
            sys.stderr.write("invalid %s %r\n" % (what, argv[3]))
        return None

    if cmd == 'add':
        sys.stdout.write("%s\n" % rw().add(sys.stdin.read()))
    elif cmd == 'add-delta':
        base = int_arg("base", "usage: revfile add-delta REVFILE BASE < INPUT\n")
        if base is None:
            return 1
        sys.stdout.write("%s\n" % rw().add(sys.stdin.read(), base))
    elif cmd == 'add-series':
        rev = int_arg("base", "usage: revfile add-series REVFILE BASE FILE...\n")
        if rev is None:
            return 1
        r = rw()
        for fn in argv[4:]:
            sys.stdout.write("%s\n" % rev)
            f = open(fn)
            try:
                text = f.read()
            finally:
                f.close()
            # Each file is stored relative to the one added before it.
            rev = r.add(text, rev)
    elif cmd == 'dump':
        ro().dump()
    elif cmd == 'get':
        idx = int_arg("index", "usage: revfile get FILE IDX\n")
        if idx is None:
            return 1

        r = ro()

        if idx < 0 or idx >= len(r):
            sys.stderr.write("invalid index %r\n" % idx)
            return 1

        sys.stdout.write(r.get(idx))
    elif cmd == 'find-sha':
        try:
            s = unhexlify(argv[3])
        except IndexError:
            sys.stderr.write("usage: revfile find-sha FILE HEX\n")
            return 1
        except (TypeError, ValueError):
            # unhexlify rejects odd-length or non-hex input
            sys.stderr.write("invalid hex digest %r\n" % argv[3])
            return 1

        if len(s) != 20:
            sys.stderr.write("invalid hex digest %r\n" % argv[3])
            return 1

        idx = ro().find_sha(s)
        if idx == _NO_RECORD:
            sys.stderr.write("no such record\n")
            return 1
        else:
            sys.stdout.write("%s\n" % idx)
    elif cmd == 'total-text-size':
        sys.stdout.write("%s\n" % ro().total_text_size())
    elif cmd == 'last':
        sys.stdout.write("%s\n" % (len(ro()) - 1))
    elif cmd == 'check':
        import bzrlib.progress
        pb = bzrlib.progress.ProgressBar()
        ro().check(pb)
    else:
        sys.stderr.write("unknown command %r\n" % cmd)
        return 1

570

571

572

# Script entry point: run the command-line driver and use its result as
# the process exit status, treating a result of None as 0.
if __name__ == '__main__':
    import sys
    status = main(sys.argv)
    if not status:
        status = 0
    sys.exit(status)

Older »