~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: mbp at sourcefrog
Date: 2005-03-09 06:44:53 UTC
Revision ID: mbp@sourcefrog.net-20050309064453-60be0ae479d019b8

store committer's timezone in revision and show
in changelog

files added:
doc/roadmap.txt

doc/testing.txt

doc/work-order.txt

files removed:
.bzrignore

.rsyncexclude

NEWS

TODO

build-api

bzrlib/add.py

bzrlib/info.py

bzrlib/mdiff.py

bzrlib/newinventory.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/textinv.py

doc/Makefile

doc/default.css

doc/ignore.txt

doc/quotes.txt

elementtree

elementtree/ElementTree.py

elementtree/__init__.py

notes

notes/new-inventory-sample.xml

notes/performance.txt

setup.py

test.sh

testbzr

urlgrabber

urlgrabber/__init__.py

urlgrabber/byterange.py

urlgrabber/grabber.py

urlgrabber/keepalive.py

urlgrabber/mirror.py

urlgrabber/progress.py

files renamed:
bzrlib/commands.py => bzr.py

files modified:
README

bzrlib/__init__.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/diff.py

bzrlib/inventory.py

bzrlib/osutils.py

bzrlib/revision.py

bzrlib/store.py

bzrlib/tests.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/xml.py

doc/bitkeeper.txt

doc/compared-codeville.txt

doc/darcs.txt

doc/formats.txt

doc/index.txt

doc/interrupted.txt

doc/purpose.txt

doc/python.txt

doc/random.txt

doc/svk.txt

doc/thanks.txt

doc/todo-from-arch.txt

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

#! /usr/bin/env python

# based on an idea by Matt Mackall

# modified to squish into bzr by Martin Pool

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Packed file revision storage.

A Revfile holds the text history of a particular source file, such

as Makefile. It can represent a tree of text versions for that

file, allowing for microbranches within a single repository.

This is stored on disk as two files: an index file, and a data file.

The index file is short and always read completely into memory; the

data file is much longer and only the relevant bits of it,

identified by the index file, need to be read.

Each text version is identified by the SHA-1 of the full text of

that version. It also has a sequence number within the file.

The index file has a short header and then a sequence of fixed-length

records:

* byte[20] SHA-1 of text (as binary, not hex)

* uint32 sequence number this is based on, or 0xFFFFFFFF (i.e. -1) for full text

* uint32 flags: 1=zlib compressed

* uint32 offset in text file of start

* uint32 length of compressed delta in text file

* uint32[3] reserved

total 48 bytes.

The header is also 48 bytes for tidiness and easy calculation.

Both the index and the text are only ever appended to; a consequence

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

read() call to get everything into memory.

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

# NOTE(review): not referenced anywhere in the visible part of this module.
factor = 10

# Size in bytes of one index record; the index header is padded to the
# same size so that record N starts at (N + 1) * _RECORDSIZE.
_RECORDSIZE = 48

# Index file header: the magic string padded with 0xff bytes up to exactly
# one record.
_HEADER = "bzr revfile v1\n"
_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))

# Sentinel meaning "no record": stored in the base field of a full text
# and returned by Revfile.find_sha when nothing matches.  It is the uint32
# representation of -1.
_NO_RECORD = 0xFFFFFFFF

# fields in the index record
I_SHA = 0
I_BASE = 1
I_FLAGS = 2
I_OFFSET = 3
I_LEN = 4

# flag bit: the stored data is zlib-compressed
FL_GZIP = 1

# maximum number of patches in a row before recording a whole text.
CHAIN_LIMIT = 50

class RevfileError(Exception):
    """Raised when a revfile is missing, corrupt, or misused."""
    pass

class LimitHitException(Exception):
    """Raised when following a delta chain exceeds the recursion limit.

    Used as a signal that a full text should be stored instead of yet
    another delta; it does not indicate corruption.
    """
    pass

class Revfile:
    """Text history of one file, stored as an index file plus a data file.

    The index lives in ``basename + '.irev'`` and the data in
    ``basename + '.drev'``.  Each version is addressed either by its
    sequence number (its 0-based position in the index) or by the binary
    SHA-1 of its full text.  Both files are only ever appended to.
    """

    def __init__(self, basename, mode):
        """Open the revfile called basename.

        mode is 'r' (read only; the revfile must exist) or 'w' (read and
        append; an empty revfile is created if neither file exists).

        Raises RevfileError for an invalid mode, when only one of the two
        files exists, when opening a missing revfile read-only, or when
        the index does not start with the expected header.
        """
        # TODO: Lock file while open

        # TODO: advise of random access

        self.basename = basename

        if mode not in ['r', 'w']:
            raise RevfileError("invalid open mode %r" % mode)
        self.mode = mode

        idxname = basename + '.irev'
        dataname = basename + '.drev'

        idx_exists = os.path.exists(idxname)
        data_exists = os.path.exists(dataname)

        # The two files are only meaningful as a pair.
        if idx_exists != data_exists:
            raise RevfileError("half-assed revfile")

        if not idx_exists:
            if mode == 'r':
                raise RevfileError("Revfile %r does not exist" % basename)

            self.idxfile = open(idxname, 'w+b')
            self.datafile = open(dataname, 'w+b')

            print('init empty file')
            self.idxfile.write(_HEADER)
            self.idxfile.flush()
        else:
            if mode == 'r':
                diskmode = 'rb'
            else:
                diskmode = 'r+b'

            self.idxfile = open(idxname, diskmode)
            self.datafile = open(dataname, diskmode)

            header = self.idxfile.read(_RECORDSIZE)
            if header != _HEADER:
                # Do not leave both files open when refusing the revfile.
                self.idxfile.close()
                self.datafile.close()
                raise RevfileError("bad header %r in index of %r"
                                   % (header, self.basename))

    def _check_index(self, idx):
        """Raise RevfileError unless idx names an existing record."""
        # Valid sequence numbers run from 0 to len(self) - 1 inclusive.
        if idx < 0 or idx >= len(self):
            raise RevfileError("invalid index %r" % idx)

    def _check_write(self):
        """Raise RevfileError unless the revfile was opened in mode 'w'."""
        if self.mode != 'w':
            raise RevfileError("%r is open readonly" % self.basename)

    def find_sha(self, s):
        """Return the sequence number of the record whose SHA-1 is s.

        s is the 20-byte binary digest.  Returns _NO_RECORD when no record
        matches.  This scans the index from the start.
        """
        assert isinstance(s, str)
        assert len(s) == 20

        for idx, idxrec in enumerate(self):
            if idxrec[I_SHA] == s:
                return idx
        return _NO_RECORD

    def _add_compressed(self, text_sha, data, base, compress):
        """Store data, zlib-compressing it first when that is a win.

        Compression is only attempted when compress is true and data is
        longer than 50 bytes, and is only kept when the result is smaller
        than the input.  Returns the index of the new record.
        """
        # well, maybe compress
        flags = 0
        if compress:
            data_len = len(data)
            if data_len > 50:
                # don't do compression if it's too small; it's unlikely to
                # win enough to be worthwhile
                compr_data = zlib.compress(data)
                if len(compr_data) < data_len:
                    data = compr_data
                    flags = FL_GZIP
        return self._add_raw(text_sha, data, base, flags)

    def _add_raw(self, text_sha, data, base, flags):
        """Append pre-processed data (full text or delta) and index it.

        data is written exactly as given to the end of the data file and a
        matching record is appended to the index.  No compression happens
        here: flags must already describe how data is encoded.

        Returns the index of the new record.
        """
        idx = len(self)
        self.datafile.seek(0, 2)        # to end
        self.idxfile.seek(0, 2)
        # The index must hold exactly the header plus idx records.
        assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
        data_offset = self.datafile.tell()

        assert isinstance(data, str)    # not unicode or anything weird

        # Data goes out first so that the index record written afterwards
        # refers to bytes already handed to the data file.
        self.datafile.write(data)
        self.datafile.flush()

        assert isinstance(text_sha, str)
        entry = text_sha
        entry += struct.pack(">IIII12x", base, flags, data_offset, len(data))
        assert len(entry) == _RECORDSIZE

        self.idxfile.write(entry)
        self.idxfile.flush()

        return idx

    def _add_full_text(self, text, text_sha, compress):
        """Add a full text to the file.

        This is not compressed against any reference version.

        Returns the index for that text."""
        return self._add_compressed(text_sha, text, _NO_RECORD, compress)

    def _add_delta(self, text, text_sha, base, compress):
        """Add a text stored relative to a previous text.

        Falls back to storing the full text when the chain behind base is
        already CHAIN_LIMIT deltas long, or when the delta is not smaller
        than the text itself.  Returns the index of the new record.
        """
        self._check_index(base)

        try:
            base_text = self.get(base, recursion_limit=CHAIN_LIMIT)
        except LimitHitException:
            return self._add_full_text(text, text_sha, compress)

        data = mdiff.bdiff(base_text, text)

        # If the delta is larger than the text, we might as well just
        # store the text.  (OK, the delta might be more compressible,
        # but the overhead of applying it probably still makes it
        # bad, and I don't want to compress both of them to find out.)
        if len(data) >= len(text):
            return self._add_full_text(text, text_sha, compress)
        return self._add_compressed(text_sha, data, base, compress)

    def add(self, text, base=_NO_RECORD, compress=True):
        """Add a new text to the revfile.

        If the text is already present then its existing id is
        returned and the file is not changed.

        If compress is true then zlib compression will be used if it
        reduces the size.

        If a base index is specified, that text *may* be used for
        delta compression of the new text.  Delta compression will
        only be used if it would be a size win and if the existing
        base is not at too long of a delta chain already.

        Raises RevfileError if the revfile is open read-only.
        """
        self._check_write()

        text_sha = sha.new(text).digest()

        idx = self.find_sha(text_sha)
        if idx != _NO_RECORD:
            # TODO: Optional paranoid mode where we read out that record
            # and make sure it's the same, in case someone ever breaks
            # SHA-1.
            return idx                  # already present

        if base == _NO_RECORD:
            return self._add_full_text(text, text_sha, compress)
        return self._add_delta(text, text_sha, base, compress)

    def get(self, idx, recursion_limit=None):
        """Retrieve text of a previous revision.

        If recursion_limit is an integer then walk back at most that
        many revisions and then raise LimitHitException, indicating
        that we ought to record a new file text instead of another
        delta.  Don't use this when trying to get out an existing
        revision.

        Raises RevfileError when the reconstructed text does not match
        the SHA-1 stored in the index.
        """
        idxrec = self[idx]
        if idxrec[I_BASE] == _NO_RECORD:
            text = self._get_full_text(idx, idxrec)
        else:
            text = self._get_patched(idx, idxrec, recursion_limit)

        if sha.new(text).digest() != idxrec[I_SHA]:
            raise RevfileError("corrupt SHA-1 digest on record %d"
                               % idx)

        return text

    def _get_raw(self, idx, idxrec):
        """Return the stored bytes for a record, decompressed if flagged.

        Raises RevfileError on flag bits other than FL_GZIP or when the
        data file yields fewer bytes than the index promises.
        """
        flags = idxrec[I_FLAGS]
        if flags & ~FL_GZIP:
            raise RevfileError("unsupported index flags %#x on index %d"
                               % (flags, idx))

        length = idxrec[I_LEN]
        if length == 0:
            return ''

        self.datafile.seek(idxrec[I_OFFSET])

        data = self.datafile.read(length)
        if len(data) != length:
            raise RevfileError("short read %d of %d "
                               "getting text for record %d in %r"
                               % (len(data), length, idx, self.basename))

        if flags & FL_GZIP:
            data = zlib.decompress(data)

        return data

    def _get_full_text(self, idx, idxrec):
        """Return the text of a record that is stored without a base."""
        assert idxrec[I_BASE] == _NO_RECORD
        return self._get_raw(idx, idxrec)

    def _get_patched(self, idx, idxrec, recursion_limit):
        """Return the text of a delta record by patching its base text.

        recursion_limit is decremented for each delta followed; when it
        would drop below zero LimitHitException is raised.  None means no
        limit.
        """
        base = idxrec[I_BASE]
        assert base >= 0
        assert base < idx               # no loops!

        if recursion_limit is None:
            sub_limit = None
        else:
            sub_limit = recursion_limit - 1
            if sub_limit < 0:
                raise LimitHitException()

        base_text = self.get(base, sub_limit)
        patch = self._get_raw(idx, idxrec)

        return mdiff.bpatch(base_text, patch)

    def __len__(self):
        """Return number of revisions.

        Computed from the current size of the index file.  Raises
        RevfileError if that size is not a whole number of records or is
        too small to hold the header.
        """
        size = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
        if size % _RECORDSIZE:
            raise RevfileError("bad length %d on index of %r" % (size, self.basename))
        if size < _RECORDSIZE:
            raise RevfileError("no header present in index of %r" % (self.basename))
        # One record-sized slot is taken by the header.
        return size // _RECORDSIZE - 1

    def __getitem__(self, idx):
        """Index by sequence id returns the index field

        Raises IndexError when idx is past the last record.
        """
        ## TODO: Can avoid seek if we just moved there...
        self._seek_index(idx)
        idxrec = self._read_next_index()
        if idxrec is None:
            raise IndexError()
        return idxrec

    def _seek_index(self, idx):
        """Position the index file at the start of record idx."""
        if idx < 0:
            raise RevfileError("invalid index %r" % idx)
        # + 1 skips the header, which occupies one record-sized slot.
        self.idxfile.seek((idx + 1) * _RECORDSIZE)

    def __iter__(self):
        """Read back all index records.

        Do not seek the index file while this is underway!"""
        self._seek_index(0)
        while True:
            idxrec = self._read_next_index()
            if idxrec is None:
                break
            yield idxrec

    def _read_next_index(self):
        """Read and unpack the index record at the current file position.

        Returns the tuple (sha, base, flags, offset, length), or None at
        end of file.  Raises RevfileError on a truncated record.
        """
        rec = self.idxfile.read(_RECORDSIZE)
        if not rec:
            return None
        if len(rec) != _RECORDSIZE:
            # Work out which record was truncated from where the read
            # started, so the error can name it.
            start = self.idxfile.tell() - len(rec)
            idx = start // _RECORDSIZE - 1
            raise RevfileError("short read of %d bytes getting index %d from %r"
                               % (len(rec), idx, self.basename))

        return struct.unpack(">20sIIII12x", rec)

    def dump(self, f=sys.stdout):
        """Write a table of all index records to the file object f."""
        f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n'
                % tuple('idx sha1 base flags offset len'.split()))
        f.write('-------- ---------------------------------------- ')
        f.write('-------- -------- -------- --------\n')

        for i, rec in enumerate(self):
            f.write("#%-7d %40s " % (i, hexlify(rec[I_SHA])))
            if rec[I_BASE] == _NO_RECORD:
                # Pad to the same width as a numeric base so that the
                # following columns stay aligned.
                f.write("%-8s " % "(none)")
            else:
                f.write("#%-7d " % rec[I_BASE])

            f.write("%8x %8d %8d\n"
                    % (rec[I_FLAGS], rec[I_OFFSET], rec[I_LEN]))

    def total_text_size(self):
        """Return the sum of sizes of all file texts.

        This is how much space they would occupy if they were stored without
        delta and gzip compression.

        As a side effect this completely validates the Revfile, checking that all
        texts can be reproduced with the correct SHA-1."""
        total = 0
        for idx in range(len(self)):
            total += len(self.get(idx))
        return total

def main(argv):
    """Command-line driver working on the revfile named 'testrev'.

    argv[1] selects the command and argv[2], where needed, is its
    argument.  Arguments are validated before the revfile is opened, so a
    malformed command line never creates or touches the files.

    Returns 1 after reporting a usage or lookup error on stderr; returns
    None when the command succeeds.
    """
    try:
        cmd = argv[1]
    except IndexError:
        sys.stderr.write("usage: revfile dump\n"
                         " revfile add\n"
                         " revfile add-delta BASE\n"
                         " revfile get IDX\n"
                         " revfile find-sha HEX\n"
                         " revfile total-text-size\n"
                         " revfile last\n")
        return 1

    def rw():
        return Revfile('testrev', 'w')

    def ro():
        return Revfile('testrev', 'r')

    if cmd == 'add':
        print(rw().add(sys.stdin.read()))
    elif cmd == 'add-delta':
        try:
            base = int(argv[2])
        except (IndexError, ValueError):
            sys.stderr.write("usage: revfile add-delta BASE\n")
            return 1

        print(rw().add(sys.stdin.read(), base))
    elif cmd == 'dump':
        ro().dump()
    elif cmd == 'get':
        try:
            idx = int(argv[2])
        except (IndexError, ValueError):
            sys.stderr.write("usage: revfile get IDX\n")
            return 1

        # Open once and use the same object for the range check and the
        # read.
        r = ro()
        if idx < 0 or idx >= len(r):
            sys.stderr.write("invalid index %r\n" % idx)
            return 1

        sys.stdout.write(r.get(idx))
    elif cmd == 'find-sha':
        try:
            s = unhexlify(argv[2])
        except (IndexError, TypeError, ValueError):
            # Missing argument, or text that is not valid hex.
            sys.stderr.write("usage: revfile find-sha HEX\n")
            return 1

        # A SHA-1 digest is 20 bytes; anything else can never match.
        if len(s) != 20:
            sys.stderr.write("usage: revfile find-sha HEX\n")
            return 1

        idx = ro().find_sha(s)
        if idx == _NO_RECORD:
            sys.stderr.write("no such record\n")
            return 1
        else:
            print(idx)
    elif cmd == 'total-text-size':
        print(ro().total_text_size())
    elif cmd == 'last':
        print(len(ro()) - 1)
    else:
        sys.stderr.write("unknown command %r\n" % cmd)
        return 1

# Run the command-line driver when executed as a script; a None result
# from main() (success) is turned into exit status 0.
if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv) or 0)

Older »