~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: Aaron Bentley
Date: 2005-11-17 04:45:29 UTC
mfrom: (1185.33.17 bzr.dev)
mto: (1185.65.14 storage)
mto: This revision was merged to the branch mainline in revision 1550.
Revision ID: aaron.bentley@utoronto.ca-20051117044529-517ac29466e34b57

Merged from mainline

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

#! /usr/bin/env python

# based on an idea by Matt Mackall

# modified to squish into bzr by Martin Pool

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Packed file revision storage.

A Revfile holds the text history of a particular source file, such

as Makefile. It can represent a tree of text versions for that

file, allowing for microbranches within a single repository.

This is stored on disk as two files: an index file, and a data file.

The index file is short and always read completely into memory; the

data file is much longer and only the relevant bits of it,

identified by the index file, need to be read.

Each text version is identified by the SHA-1 of the full text of

that version. It also has a sequence number within the file.

The index file has a short header and then a sequence of fixed-length

records:

* byte[20] SHA-1 of text (as binary, not hex)

* uint32 sequence number this is based on, or -1 for full text

* uint32 flags: 1=zlib compressed

* uint32 offset in text file of start

* uint32 length of compressed delta in text file

* uint32[3] reserved

total 48 bytes.

The header is also 48 bytes for tidiness and easy calculation.

Both the index and the text are only ever appended to; a consequence

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

read() call to get everything into memory.

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

# TODO: Shouldn't need to lock if we always write in append mode and

# then ftell after writing to see where it went. In any case we

# assume the whole branch is protected by a lock.

import os

import sha

import stat

import struct

100

import sys

101

import zlib

102

from binascii import hexlify, unhexlify

103

104

import bzrlib.mdiff as mdiff

105

106

107

_RECORDSIZE = 48

108

109

_HEADER = "bzr revfile v1\n"

110

_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))

111

_NO_RECORD = 0xFFFFFFFFL

112

113

# fields in the index record

114

I_SHA = 0

115

I_BASE = 1

116

I_FLAGS = 2

117

I_OFFSET = 3

118

I_LEN = 4

119

120

FL_GZIP = 1

121

122

# maximum number of patches in a row before recording a whole text.

123

CHAIN_LIMIT = 10

124

125

126

class RevfileError(Exception):
    """Raised when a revfile is missing, malformed, corrupt or misused."""

128

129

class LimitHitException(Exception):
    """Raised while reading a text when its delta chain exceeds the limit."""

131

132

class Revfile(object):

133

def __init__(self, basename, mode):
    """Open, or in write mode create, the revfile called basename.

    The index lives in basename + '.irev' and the data in
    basename + '.drev'.  mode is 'r' for read-only or 'w' for
    read-write; in 'w' mode a missing revfile is created with just a
    header.

    Raises RevfileError for an unknown mode, when only one of the two
    files exists, when a revfile opened read-only does not exist, or
    when the index does not start with the expected header.
    """
    # TODO: Lock file while open

    # TODO: advise of random access

    self.basename = basename

    if mode not in ('r', 'w'):
        raise RevfileError("invalid open mode %r" % mode)
    self.mode = mode

    idxname = basename + '.irev'
    dataname = basename + '.drev'

    have_idx = os.path.exists(idxname)
    have_data = os.path.exists(dataname)
    if have_idx != have_data:
        raise RevfileError("half-assed revfile")

    if have_idx:
        # Existing revfile: open both halves and verify the magic header.
        diskmode = {'r': 'rb', 'w': 'r+b'}[mode]
        self.idxfile = open(idxname, diskmode)
        self.datafile = open(dataname, diskmode)

        header = self.idxfile.read(_RECORDSIZE)
        if header != _HEADER:
            raise RevfileError("bad header %r in index of %r"
                               % (header, self.basename))
        return

    # Nothing on disk yet: only a writer may create it.
    if mode == 'r':
        raise RevfileError("Revfile %r does not exist" % basename)

    self.idxfile = open(idxname, 'w+b')
    self.datafile = open(dataname, 'w+b')

    self.idxfile.write(_HEADER)
    self.idxfile.flush()

175

176

177

def _check_index(self, idx):
    """Raise RevfileError unless idx is the sequence number of a record.

    Valid sequence numbers run from 0 up to and including
    len(self) - 1; len(self) itself does not name a record yet.
    """
    if idx < 0 or idx >= len(self):
        raise RevfileError("invalid index %r" % idx)

180

181

def _check_write(self):
    """Raise RevfileError unless this revfile was opened in 'w' mode."""
    if self.mode == 'w':
        return
    raise RevfileError("%r is open readonly" % self.basename)

184

185

186

def find_sha(self, s):
    """Return the sequence number of the first record whose SHA-1 is s.

    s is the 20-byte binary digest (not hex).  The index is scanned
    from the beginning; _NO_RECORD is returned when nothing matches.
    """
    assert isinstance(s, str)
    assert len(s) == 20

    seqno = 0
    for record in self:
        if record[I_SHA] == s:
            return seqno
        seqno += 1
    return _NO_RECORD

195

196

197

198

def _add_compressed(self, text_sha, data, base, compress):
    """Store data, zlib-compressing it first when that is a win.

    Compression is attempted only if compress is true and data is
    longer than 50 bytes, and is kept only if the result is strictly
    smaller.  Returns whatever _add_raw returns, i.e. the new
    sequence number.
    """
    flags = 0
    # don't do compression if it's too small; it's unlikely to win
    # enough to be worthwhile
    if compress and len(data) > 50:
        squeezed = zlib.compress(data)
        if len(squeezed) < len(data):
            data, flags = squeezed, FL_GZIP
    return self._add_raw(text_sha, data, base, flags)

214

215

216

217

def _add_raw(self, text_sha, data, base, flags):
    """Append pre-processed data, either a full text or a delta.

    data is written to the data file exactly as given; flags must
    already describe how it was encoded.  The data is written and
    flushed before the index record that points at it.

    Returns the sequence number of the new record.
    """
    seqno = len(self)
    for stream in (self.datafile, self.idxfile):
        stream.seek(0, 2)   # to end
    assert self.idxfile.tell() == _RECORDSIZE * (seqno + 1)
    offset = self.datafile.tell()

    assert isinstance(data, str)    # not unicode or anything weird

    self.datafile.write(data)
    self.datafile.flush()

    assert isinstance(text_sha, str)
    record = text_sha + struct.pack(">IIII12x",
                                    base, flags, offset, len(data))
    assert len(record) == _RECORDSIZE

    self.idxfile.write(record)
    self.idxfile.flush()

    return seqno

241

242

243

244

def _add_full_text(self, text, text_sha, compress):
    """Store text in full rather than as a delta against another version.

    The record's base is set to _NO_RECORD.  Returns the index for
    that text.
    """
    return self._add_compressed(text_sha, text,
                                base=_NO_RECORD, compress=compress)

251

252

253

# NOT USED

254

def _choose_base(self, seed, base):
    """Pick a base by walking up the delta chain from base.

    One step towards the full text is taken for each pair of set
    low-order bits consumed from seed; the walk stops early at a
    record that is itself a full text.  Returns _NO_RECORD if base is
    _NO_RECORD when examined.
    """
    while (seed & 3) == 3:
        if base == _NO_RECORD:
            return _NO_RECORD
        parent = self[base][I_BASE]
        if parent == _NO_RECORD:
            # base is a full text; cannot go any further back
            break
        base, seed = parent, seed >> 2

    return base     # relative to this full text

266

267

268

269

def _add_delta(self, text, text_sha, base, compress):
    """Add a text stored relative to a previous text.

    Falls back to storing the full text when the chain behind base is
    already longer than CHAIN_LIMIT, or when the delta is not smaller
    than the text itself.  Returns the new sequence number.
    """
    self._check_index(base)

    try:
        base_text = self.get(base, CHAIN_LIMIT)
    except LimitHitException:
        return self._add_full_text(text, text_sha, compress)

    delta = mdiff.bdiff(base_text, text)

    # paranoid early check for bad diff
    roundtrip = mdiff.bpatch(base_text, delta)
    assert roundtrip == text

    if len(delta) < len(text):
        return self._add_compressed(text_sha, delta, base, compress)

    # If the delta is larger than the text, we might as well just
    # store the text.  (OK, the delta might be more compressible,
    # but the overhead of applying it probably still makes it
    # bad, and I don't want to compress both of them to find out.)
    return self._add_full_text(text, text_sha, compress)

294

295

296

def add(self, text, base=None, compress=True):
    """Add a new text to the revfile.

    If the text is already present then its existing id is
    returned and the file is not changed.

    If compress is true then gzip compression will be used if it
    reduces the size.

    If a base index is specified, that text *may* be used for
    delta compression of the new text.  Delta compression will
    only be used if it would be a size win and if the existing
    base is not at too long of a delta chain already.

    Returns the sequence number of the text.  Raises RevfileError if
    the revfile is not open for writing.
    """
    if base is None:
        base = _NO_RECORD

    self._check_write()

    text_sha = sha.new(text).digest()

    idx = self.find_sha(text_sha)
    if idx != _NO_RECORD:
        # TODO: Optional paranoid mode where we read out that record and make sure
        # it's the same, in case someone ever breaks SHA-1.
        return idx      # already present

    # base = self._choose_base(ord(text_sha[0]), base)

    if base == _NO_RECORD:
        return self._add_full_text(text, text_sha, compress)
    return self._add_delta(text, text_sha, base, compress)

329

330

331

332

def get(self, idx, recursion_limit=None):
    """Retrieve text of a previous revision.

    If recursion_limit is an integer then walk back at most that
    many revisions and then raise LimitHitException, indicating
    that we ought to record a new file text instead of another
    delta.  Don't use this when trying to get out an existing
    revision.

    The reconstructed text is checked against the SHA-1 held in the
    index; RevfileError is raised on a mismatch.
    """
    idxrec = self[idx]

    if idxrec[I_BASE] != _NO_RECORD:
        text = self._get_patched(idx, idxrec, recursion_limit)
    else:
        text = self._get_full_text(idx, idxrec)

    digest = sha.new(text).digest()
    if digest == idxrec[I_SHA]:
        return text

    raise RevfileError("corrupt SHA-1 digest on record %d in %s"
                       % (idx, self.basename))

353

354

355

356

def _get_raw(self, idx, idxrec):
    """Read the stored bytes for record idx, decompressing if flagged.

    idxrec is the already-unpacked index record for idx.  Raises
    RevfileError if the record carries flags other than FL_GZIP or if
    the data file yields fewer bytes than the index promises.
    """
    flags = idxrec[I_FLAGS]
    if flags & ~FL_GZIP:
        raise RevfileError("unsupported index flags %#x on index %d"
                           % (flags, idx))

    length = idxrec[I_LEN]
    if length == 0:
        return ''

    self.datafile.seek(idxrec[I_OFFSET])
    stored = self.datafile.read(length)
    if len(stored) != length:
        raise RevfileError("short read %d of %d "
                           "getting text for record %d in %r"
                           % (len(stored), length, idx, self.basename))

    if flags & FL_GZIP:
        return zlib.decompress(stored)
    return stored

378

379

380

def _get_full_text(self, idx, idxrec):
    """Return the text of record idx, which must be stored as a full text."""
    assert idxrec[I_BASE] == _NO_RECORD
    return self._get_raw(idx, idxrec)

386

387

388

def _get_patched(self, idx, idxrec, recursion_limit):
    """Rebuild the text of record idx by patching the text of its base.

    idxrec is the unpacked index record for idx.  recursion_limit is
    None for no limit, otherwise the number of deltas that may still
    be followed; LimitHitException is raised once it is used up.

    Raises RevfileError if the record's base is not an earlier record,
    since following it could never terminate.
    """
    base = idxrec[I_BASE]
    # The base comes from the on-disk index, so validate it with a real
    # check rather than an assert: under -O a looping index would
    # otherwise recurse without bound.
    if base < 0 or base >= idx:     # no loops!
        raise RevfileError("record %d in %r has invalid base %d"
                           % (idx, self.basename, base))

    if recursion_limit is None:
        sub_limit = None
    else:
        sub_limit = recursion_limit - 1
        if sub_limit < 0:
            raise LimitHitException()

    base_text = self.get(base, sub_limit)
    patch = self._get_raw(idx, idxrec)

    return mdiff.bpatch(base_text, patch)

406

407

408

409

def __len__(self):
    """Return number of revisions.

    Derived from the current size of the index file, less the header
    record.  Raises RevfileError if that size is not a whole number of
    records or is too small to hold the header.
    """
    size = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
    records, leftover = divmod(size, _RECORDSIZE)
    if leftover:
        raise RevfileError("bad length %d on index of %r" % (size, self.basename))
    if records < 1:
        raise RevfileError("no header present in index of %r" % (self.basename))
    return int(records) - 1

417

418

419

def __getitem__(self, idx):
    """Index by sequence id returns the index field

    The result is the unpacked index record for idx.  Raises
    IndexError if the index file ends before that record.
    """
    ## TODO: Can avoid seek if we just moved there...
    self._seek_index(idx)
    idxrec = self._read_next_index()
    if idxrec is None:
        raise IndexError("no index %d" % idx)
    return idxrec

428

429

430

def _seek_index(self, idx):
    """Position the index file at the start of record idx.

    Raises RevfileError for a negative idx.
    """
    if idx < 0:
        raise RevfileError("invalid index %r" % idx)
    # the extra record skipped is the header
    offset = _RECORDSIZE * (idx + 1)
    self.idxfile.seek(offset)

434

435

436

437

def __iter__(self):
    """Read back all index records.

    Records are yielded in sequence order, starting from record 0.

    Do not seek the index file while this is underway!"""
    self._seek_index(0)
    # _read_next_index returns None once the index is exhausted
    for idxrec in iter(self._read_next_index, None):
        yield idxrec

448

449

450

def _read_next_index(self):
    """Read and unpack the index record at the current file position.

    Returns a (sha, base, flags, offset, length) tuple, or None when
    the index file is already at its end.  Raises RevfileError if only
    part of a record could be read.
    """
    rec = self.idxfile.read(_RECORDSIZE)
    if not rec:
        return None
    if len(rec) != _RECORDSIZE:
        # No index number is passed in, so recover it from where the
        # truncated read began (the header occupies the first slot).
        start = self.idxfile.tell() - len(rec)
        idx = start // _RECORDSIZE - 1
        raise RevfileError("short read of %d bytes getting index %d from %r"
                           % (len(rec), idx, self.basename))

    return struct.unpack(">20sIIII12x", rec)

459

460

461

def dump(self, f=sys.stdout):
    """Write a table of every index record to f, one line per record."""
    emit = f.write
    emit('%-8s %-40s %-8s %-8s %-8s %-8s\n'
         % tuple('idx sha1 base flags offset len'.split()))
    emit('-------- ---------------------------------------- ')
    emit('-------- -------- -------- --------\n')

    for seqno, (digest, base, flags, offset, length) in enumerate(self):
        emit("#%-7d %40s " % (seqno, hexlify(digest)))
        if base != _NO_RECORD:
            emit("#%-7d " % base)
        else:
            emit("(none) ")

        emit("%8x %8d %8d\n" % (flags, offset, length))

475

476

477

def total_text_size(self):

478

"""Return the sum of sizes of all file texts.

479

480

This is how much space they would occupy if they were stored without

481

delta and gzip compression.

482

483

As a side effect this completely validates the Revfile, checking that all

484

texts can be reproduced with the correct SHA-1."""

485

t = 0L

486

for idx in range(len(self)):

487

t += len(self.get(idx))

488

return t

489

490

491

def check(self, pb=None):
    """Extract every version and check its hash.

    If pb is given it is updated before each revision is read and
    cleared once all of them have been checked.
    """
    count = len(self)
    seqno = 0
    while seqno < count:
        if pb:
            pb.update("check revision", seqno, count)
        # the get method implicitly checks the SHA-1
        self.get(seqno)
        seqno += 1
    if pb:
        pb.clear()

501

502

503

504

def main(argv):
    """Command-line front end for inspecting and updating a revfile.

    argv[1] is the command and argv[2] the revfile name; a trailing
    '.irev' or '.drev' suffix on the name is stripped.  Remaining
    arguments depend on the command.

    Returns 1 after reporting a usage or lookup error on stderr, and
    None on success.
    """
    try:
        cmd = argv[1]
        filename = argv[2]
    except IndexError:
        sys.stderr.write("usage: revfile dump REVFILE\n"
                         " revfile add REVFILE < INPUT\n"
                         " revfile add-delta REVFILE BASE < INPUT\n"
                         " revfile add-series REVFILE BASE FILE...\n"
                         " revfile get REVFILE IDX\n"
                         " revfile find-sha REVFILE HEX\n"
                         " revfile total-text-size REVFILE\n"
                         " revfile last REVFILE\n"
                         " revfile check REVFILE\n")
        return 1

    if filename.endswith('.drev') or filename.endswith('.irev'):
        filename = filename[:-5]

    def rw():
        return Revfile(filename, 'w')

    def ro():
        return Revfile(filename, 'r')

    if cmd == 'add':
        print(rw().add(sys.stdin.read()))
    elif cmd == 'add-delta':
        print(rw().add(sys.stdin.read(), int(argv[3])))
    elif cmd == 'add-series':
        r = rw()
        rev = int(argv[3])
        for fn in argv[4:]:
            print(rev)
            # Close each input explicitly instead of leaving it to the
            # garbage collector.
            infile = open(fn)
            try:
                text = infile.read()
            finally:
                infile.close()
            rev = r.add(text, rev)
    elif cmd == 'dump':
        ro().dump()
    elif cmd == 'get':
        try:
            idx = int(argv[3])
        except (IndexError, ValueError):
            # a missing or a non-numeric IDX are both usage errors
            sys.stderr.write("usage: revfile get FILE IDX\n")
            return 1

        r = ro()

        if idx < 0 or idx >= len(r):
            sys.stderr.write("invalid index %r\n" % idx)
            return 1

        sys.stdout.write(r.get(idx))
    elif cmd == 'find-sha':
        try:
            s = unhexlify(argv[3])
        except (IndexError, TypeError, ValueError):
            # missing argument, or text that is not valid hex
            sys.stderr.write("usage: revfile find-sha FILE HEX\n")
            return 1
        if len(s) != 20:
            # find_sha only accepts a complete binary SHA-1
            sys.stderr.write("usage: revfile find-sha FILE HEX\n")
            return 1

        idx = ro().find_sha(s)
        if idx == _NO_RECORD:
            sys.stderr.write("no such record\n")
            return 1
        else:
            print(idx)
    elif cmd == 'total-text-size':
        print(ro().total_text_size())
    elif cmd == 'last':
        print(len(ro()) - 1)
    elif cmd == 'check':
        import bzrlib.progress
        pb = bzrlib.progress.ProgressBar()
        ro().check(pb)
    else:
        sys.stderr.write("unknown command %r\n" % cmd)
        return 1

578

579

580

# Run as a script: the process exit status is main()'s result, or 0 when
# main() returns nothing.
if __name__ == '__main__':
    import sys
    raise SystemExit(main(sys.argv) or 0)

Older »