~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: mbp at sourcefrog
Date: 2005-03-23 06:25:55 UTC
Revision ID: mbp@sourcefrog.net-20050323062555-5489339018d0c043

- import a subset of elementtree for easier installation

files added:
doc/faq.txt

doc/quickref.txt

doc/roadmap.txt

doc/testing.txt

doc/work-order.txt

files removed:
.rsyncexclude

TODO

bzrlib/cache.py

bzrlib/help.py

bzrlib/info.py

bzrlib/log.py

bzrlib/mdiff.py

bzrlib/newinventory.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/status.py

bzrlib/textinv.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/zsh

contrib/zsh/_bzr

doc/ignore.txt

doc/quotes.txt

doc/revfile-annotation.txt

doc/revfile.txt

doc/switch-in-branch.txt

notes/new-inventory-sample.xml

testbzr

urlgrabber

urlgrabber/__init__.py

urlgrabber/byterange.py

urlgrabber/grabber.py

urlgrabber/keepalive.py

urlgrabber/mirror.py

urlgrabber/progress.py

files modified:
.bzrignore

NEWS

README

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/inventory.py

bzrlib/osutils.py

bzrlib/revision.py

bzrlib/store.py

bzrlib/tests.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

doc/Makefile

doc/bitkeeper.txt

doc/darcs.txt

doc/formats.txt

doc/index.txt

doc/interrupted.txt

doc/merge.txt

doc/python.txt

doc/random.txt

doc/svk.txt

doc/todo-from-arch.txt

elementtree/ElementTree.py

notes/performance.txt

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

#! /usr/bin/env python

# based on an idea by Matt Mackall

# modified to squish into bzr by Martin Pool

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Packed file revision storage.

A Revfile holds the text history of a particular source file, such

as Makefile. It can represent a tree of text versions for that

file, allowing for microbranches within a single repository.

This is stored on disk as two files: an index file, and a data file.

The index file is short and always read completely into memory; the

data file is much longer and only the relevant bits of it,

identified by the index file, need to be read.

Each text version is identified by the SHA-1 of the full text of

that version. It also has a sequence number within the file.

The index file has a short header and then a sequence of fixed-length

records:

* byte[20] SHA-1 of text (as binary, not hex)

* uint32 sequence number this is based on, or -1 for full text

* uint32 flags: 1=zlib compressed

* uint32 offset in text file of start

* uint32 length of compressed delta in text file

* uint32[3] reserved

total 48 bytes.

The header is also 48 bytes for tidiness and easy calculation.

Both the index and the text are only ever appended to; a consequence

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

read() call to get everything into memory.

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

# NOTE(review): not referenced by any code visible in this file -- confirm
# whether it is still needed before relying on it.
factor = 10

# Size in bytes of one index record; the index header occupies one slot of
# the same size (see the module docstring).
_RECORDSIZE = 48

# Magic text identifying a revfile index.
_HEADER = "bzr revfile v1\n"

# Pad the magic text with 0xff bytes so the header is exactly _RECORDSIZE long.
_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))

# Sentinel stored in the base field of a record that holds a full text rather
# than a delta; find_sha also returns it when no record matches.
_NO_RECORD = 0xFFFFFFFFL

# fields in the index record
# (positions within the tuple produced when an index record is unpacked)

I_SHA = 0

I_BASE = 1

I_FLAGS = 2

I_OFFSET = 3

I_LEN = 4

# Flag bit set in I_FLAGS when the stored data was compressed with zlib.
FL_GZIP = 1

# maximum number of patches in a row before recording a whole text.

CHAIN_LIMIT = 50

100

101

102

class RevfileError(Exception):
    """Raised when a revfile is missing, malformed, corrupt or misused."""

104

105

class LimitHitException(Exception):
    """Raised when reconstructing a text walks back more deltas than allowed.

    It signals that a new full text should be stored instead of another
    delta on top of an already long chain.
    """

107

108

class Revfile:
    """Append-only store for the text history of one file.

    The store is a pair of files: ``basename + '.irev'`` holds a header
    followed by fixed-size index records, and ``basename + '.drev'`` holds
    the (possibly zlib-compressed) full texts and deltas those records
    point at.  Records are addressed by their sequence number, starting
    at 0, or looked up by the binary SHA-1 of the full text.
    """

    def __init__(self, basename, mode):
        """Open the revfile called `basename`.

        mode -- 'r' to open an existing revfile read-only, or 'w' to open
            it for appending, creating an empty revfile if neither file
            exists yet.

        Raises RevfileError if the mode is invalid, if only one of the two
        files exists, if the revfile is missing in 'r' mode, or if the
        index header is not the expected one.
        """
        # TODO: Lock file while open

        # TODO: advise of random access

        self.basename = basename

        if mode not in ['r', 'w']:
            raise RevfileError("invalid open mode %r" % mode)
        self.mode = mode

        idxname = basename + '.irev'
        dataname = basename + '.drev'

        idx_exists = os.path.exists(idxname)
        data_exists = os.path.exists(dataname)

        # The two files only make sense as a pair.
        if idx_exists != data_exists:
            raise RevfileError("half-assed revfile")

        if not idx_exists:
            if mode == 'r':
                raise RevfileError("Revfile %r does not exist" % basename)

            self.idxfile = open(idxname, 'w+b')
            self.datafile = open(dataname, 'w+b')

            # NOTE(review): debug trace on stdout; it is interleaved with
            # whatever the caller prints.  Kept to preserve behaviour.
            print('init empty file')
            self.idxfile.write(_HEADER)
            self.idxfile.flush()
        else:
            if mode == 'r':
                diskmode = 'rb'
            else:
                diskmode = 'r+b'

            self.idxfile = open(idxname, diskmode)
            self.datafile = open(dataname, diskmode)

            h = self.idxfile.read(_RECORDSIZE)
            if h != _HEADER:
                # Do not leave the handles open on an unusable revfile.
                self.idxfile.close()
                self.datafile.close()
                raise RevfileError("bad header %r in index of %r"
                                   % (h, self.basename))

    def _check_index(self, idx):
        """Raise RevfileError unless `idx` names an existing record."""
        # Valid sequence numbers run from 0 to len(self) - 1 inclusive.
        if idx < 0 or idx >= len(self):
            raise RevfileError("invalid index %r" % idx)

    def _check_write(self):
        """Raise RevfileError unless the revfile was opened in 'w' mode."""
        if self.mode != 'w':
            raise RevfileError("%r is open readonly" % self.basename)

    def find_sha(self, s):
        """Return the index of the record whose SHA-1 is `s`.

        `s` must be the 20-byte binary digest.  Returns _NO_RECORD when no
        record matches.  This is a linear scan of the whole index.
        """
        assert isinstance(s, str)
        assert len(s) == 20

        for idx, idxrec in enumerate(self):
            if idxrec[I_SHA] == s:
                return idx
        return _NO_RECORD

    def _add_compressed(self, text_sha, data, base, compress):
        """Store `data`, zlib-compressing it first when that is a win.

        Compression is only attempted when `compress` is true and the data
        is longer than 50 bytes, and is only kept when the result is
        smaller than the input.  Returns the index of the new record.
        """
        # well, maybe compress
        flags = 0
        if compress:
            data_len = len(data)
            if data_len > 50:
                # don't do compression if it's too small; it's unlikely to
                # win enough to be worthwhile
                compr_data = zlib.compress(data)
                compr_len = len(compr_data)
                if compr_len < data_len:
                    data = compr_data
                    flags = FL_GZIP
        return self._add_raw(text_sha, data, base, flags)

    def _add_raw(self, text_sha, data, base, flags):
        """Add pre-processed data, can be either full text or delta.

        `data` is written exactly as given; any compression must already
        have been applied and be reflected in `flags`.  The data is
        appended and flushed before the index record that refers to it.
        Returns the index of the new record.
        """
        idx = len(self)
        self.datafile.seek(0, 2)        # to end
        self.idxfile.seek(0, 2)
        # The header occupies the first record-sized slot of the index.
        assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
        data_offset = self.datafile.tell()

        assert isinstance(data, str)    # not unicode or anything weird

        self.datafile.write(data)
        self.datafile.flush()

        assert isinstance(text_sha, str)
        entry = text_sha
        entry += struct.pack(">IIII12x", base, flags, data_offset, len(data))
        assert len(entry) == _RECORDSIZE

        self.idxfile.write(entry)
        self.idxfile.flush()

        return idx

    def _add_full_text(self, text, text_sha, compress):
        """Add a full text to the file.

        This is not compressed against any reference version.

        Returns the index for that text."""
        return self._add_compressed(text_sha, text, _NO_RECORD, compress)

    def _add_delta(self, text, text_sha, base, compress):
        """Add a text stored relative to a previous text.

        Falls back to storing the full text when the delta chain behind
        `base` is already CHAIN_LIMIT long, or when the delta is not
        smaller than the text itself.  Returns the index of the new record.
        """
        self._check_index(base)

        try:
            base_text = self.get(base, recursion_limit=CHAIN_LIMIT)
        except LimitHitException:
            return self._add_full_text(text, text_sha, compress)

        data = mdiff.bdiff(base_text, text)

        # If the delta is larger than the text, we might as well just
        # store the text.  (OK, the delta might be more compressible,
        # but the overhead of applying it probably still makes it
        # bad, and I don't want to compress both of them to find out.)
        if len(data) >= len(text):
            return self._add_full_text(text, text_sha, compress)
        else:
            return self._add_compressed(text_sha, data, base, compress)

    def add(self, text, base=_NO_RECORD, compress=True):
        """Add a new text to the revfile.

        If the text is already present then its existing id is
        returned and the file is not changed.

        If compress is true then gzip compression will be used if it
        reduces the size.

        If a base index is specified, that text *may* be used for
        delta compression of the new text.  Delta compression will
        only be used if it would be a size win and if the existing
        base is not at too long of a delta chain already.
        """
        self._check_write()

        text_sha = sha.new(text).digest()

        idx = self.find_sha(text_sha)
        if idx != _NO_RECORD:
            # TODO: Optional paranoid mode where we read out that record and
            # make sure it's the same, in case someone ever breaks SHA-1.
            return idx                  # already present

        if base == _NO_RECORD:
            return self._add_full_text(text, text_sha, compress)
        else:
            return self._add_delta(text, text_sha, base, compress)

    def get(self, idx, recursion_limit=None):
        """Retrieve text of a previous revision.

        If recursion_limit is an integer then walk back at most that
        many revisions and then raise LimitHitException, indicating
        that we ought to record a new file text instead of another
        delta.  Don't use this when trying to get out an existing
        revision.

        Raises RevfileError if the reconstructed text does not match the
        SHA-1 stored in the index record.
        """
        idxrec = self[idx]
        base = idxrec[I_BASE]
        if base == _NO_RECORD:
            text = self._get_full_text(idx, idxrec)
        else:
            text = self._get_patched(idx, idxrec, recursion_limit)

        if sha.new(text).digest() != idxrec[I_SHA]:
            raise RevfileError("corrupt SHA-1 digest on record %d"
                               % idx)

        return text

    def _get_raw(self, idx, idxrec):
        """Return the stored bytes for record `idx`, decompressed if needed.

        The result is either a full text or a delta, depending on the
        record.  Raises RevfileError on unknown flag bits or a short read.
        """
        flags = idxrec[I_FLAGS]
        if flags & ~FL_GZIP:
            raise RevfileError("unsupported index flags %#x on index %d"
                               % (flags, idx))

        l = idxrec[I_LEN]
        if l == 0:
            return ''

        self.datafile.seek(idxrec[I_OFFSET])

        data = self.datafile.read(l)
        if len(data) != l:
            raise RevfileError("short read %d of %d "
                               "getting text for record %d in %r"
                               % (len(data), l, idx, self.basename))

        if flags & FL_GZIP:
            data = zlib.decompress(data)

        return data

    def _get_full_text(self, idx, idxrec):
        """Return the text of a record that is stored without a base."""
        assert idxrec[I_BASE] == _NO_RECORD

        text = self._get_raw(idx, idxrec)

        return text

    def _get_patched(self, idx, idxrec, recursion_limit):
        """Return the text of a delta record by patching its base text.

        `recursion_limit` is None for no limit, otherwise the number of
        further deltas that may be followed; LimitHitException is raised
        once it is exhausted.
        """
        base = idxrec[I_BASE]
        assert base >= 0
        assert base < idx               # no loops!

        if recursion_limit is None:
            sub_limit = None
        else:
            sub_limit = recursion_limit - 1
            if sub_limit < 0:
                raise LimitHitException()

        base_text = self.get(base, sub_limit)
        patch = self._get_raw(idx, idxrec)

        text = mdiff.bpatch(base_text, patch)

        return text

    def __len__(self):
        """Return number of revisions."""
        l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
        if l % _RECORDSIZE:
            raise RevfileError("bad length %d on index of %r" % (l, self.basename))
        if l < _RECORDSIZE:
            raise RevfileError("no header present in index of %r" % (self.basename))
        # Discount the header, which takes up one record-sized slot.
        return l // _RECORDSIZE - 1

    def __getitem__(self, idx):
        """Index by sequence id returns the index field"""
        ## TODO: Can avoid seek if we just moved there...
        self._seek_index(idx)
        idxrec = self._read_next_index()
        if idxrec is None:
            raise IndexError()
        else:
            return idxrec

    def _seek_index(self, idx):
        """Position the index file at the start of record `idx`."""
        if idx < 0:
            raise RevfileError("invalid index %r" % idx)
        # + 1 skips the header slot.
        self.idxfile.seek((idx + 1) * _RECORDSIZE)

    def __iter__(self):
        """Read back all index records.

        Do not seek the index file while this is underway!"""
        # NOTE(review): debug trace; kept to preserve behaviour.
        sys.stderr.write(" ** iter called ** \n")
        self._seek_index(0)
        while True:
            idxrec = self._read_next_index()
            if not idxrec:
                break
            yield idxrec

    def _read_next_index(self):
        """Read and unpack the index record at the current file position.

        Returns the unpacked record tuple, or None at end of file.
        Raises RevfileError if only part of a record could be read.
        """
        rec = self.idxfile.read(_RECORDSIZE)
        if not rec:
            return None
        elif len(rec) != _RECORDSIZE:
            # Work out which record was truncated from where the read began.
            start = self.idxfile.tell() - len(rec)
            idx = start // _RECORDSIZE - 1
            raise RevfileError("short read of %d bytes getting index %d from %r"
                               % (len(rec), idx, self.basename))

        return struct.unpack(">20sIIII12x", rec)

    def dump(self, f=sys.stdout):
        """Write a table of all index records to the file object `f`."""
        f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n'
                % tuple('idx sha1 base flags offset len'.split()))
        f.write('-------- ---------------------------------------- ')
        f.write('-------- -------- -------- --------\n')

        for i, rec in enumerate(self):
            f.write("#%-7d %40s " % (i, hexlify(rec[I_SHA])))
            if rec[I_BASE] == _NO_RECORD:
                f.write("(none) ")
            else:
                f.write("#%-7d " % rec[I_BASE])

            f.write("%8x %8d %8d\n" % (rec[I_FLAGS], rec[I_OFFSET], rec[I_LEN]))

    def total_text_size(self):
        """Return the sum of sizes of all file texts.

        This is how much space they would occupy if they were stored without
        delta and gzip compression.

        As a side effect this completely validates the Revfile, checking that all
        texts can be reproduced with the correct SHA-1."""
        t = 0
        for idx in range(len(self)):
            t += len(self.get(idx))
        return t

439

440

441

442

def main(argv):
    """Run one revfile command against the revfile named 'testrev'.

    argv -- full argument list; argv[1] is the command and argv[2], where
        the command needs one, is its argument.

    Returns 1 after reporting a usage error, a bad argument or a failed
    lookup on stderr; otherwise returns None.
    """
    try:
        cmd = argv[1]
    except IndexError:
        sys.stderr.write("usage: revfile dump\n"
                         " revfile add\n"
                         " revfile add-delta BASE\n"
                         " revfile get IDX\n"
                         " revfile find-sha HEX\n"
                         " revfile total-text-size\n"
                         " revfile last\n")
        return 1

    def rw():
        return Revfile('testrev', 'w')

    def ro():
        return Revfile('testrev', 'r')

    if cmd == 'add':
        sys.stdout.write("%s\n" % rw().add(sys.stdin.read()))
    elif cmd == 'add-delta':
        # Validate the argument before opening (and maybe creating) the file.
        try:
            base = int(argv[2])
        except (IndexError, ValueError):
            sys.stderr.write("usage: revfile add-delta BASE\n")
            return 1

        sys.stdout.write("%s\n" % rw().add(sys.stdin.read(), base))
    elif cmd == 'dump':
        ro().dump()
    elif cmd == 'get':
        try:
            idx = int(argv[2])
        except (IndexError, ValueError):
            sys.stderr.write("usage: revfile get IDX\n")
            return 1

        # Open once and use the same object for the range check and the read.
        r = ro()
        if idx < 0 or idx >= len(r):
            sys.stderr.write("invalid index %r\n" % idx)
            return 1

        sys.stdout.write(r.get(idx))
    elif cmd == 'find-sha':
        try:
            s = unhexlify(argv[2])
        except IndexError:
            sys.stderr.write("usage: revfile find-sha HEX\n")
            return 1
        except (TypeError, ValueError):
            # unhexlify rejects odd-length or non-hex input.
            sys.stderr.write("invalid SHA-1 %r\n" % argv[2])
            return 1

        # find_sha only accepts a full 20-byte digest.
        if len(s) != 20:
            sys.stderr.write("invalid SHA-1 %r\n" % argv[2])
            return 1

        idx = ro().find_sha(s)
        if idx == _NO_RECORD:
            sys.stderr.write("no such record\n")
            return 1
        else:
            sys.stdout.write("%s\n" % idx)
    elif cmd == 'total-text-size':
        sys.stdout.write("%s\n" % ro().total_text_size())
    elif cmd == 'last':
        sys.stdout.write("%s\n" % (len(ro()) - 1))
    else:
        sys.stderr.write("unknown command %r\n" % cmd)
        return 1

499

500

501

# Script entry point: run the command-line driver and use its result as the
# process exit status (a None result is reported as 0).
if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv) or 0)

Older »