~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: Martin Pool
Date: 2005-08-12 15:41:44 UTC
Revision ID: mbp@sourcefrog.net-20050812154144-bc98570a78b8f633

- merge in deferred revfile work

files added:
notes/revfile.txt

files modified:
bzrlib/mdiff.py

bzrlib/revfile.py

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, at

which point the index file will be ~4.8MB (100,000 records of 48 bytes

each) and rather large to read sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA-1 so we can find the index associated

with a particular hash much more efficiently. For 100,000 revs we would

be able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

The iter method here will generally read through the whole index file

in one go. With readahead in the kernel and python/libc (typically

128kB) this means that there should be no seeks and often only one

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

factor = 10

_RECORDSIZE = 48

_HEADER = "bzr revfile v1\n"

110

FL_GZIP = 1

111

112

# maximum number of patches in a row before recording a whole text.

CHAIN_LIMIT = 50

113

CHAIN_LIMIT = 25

100

114

101

115

102

116

class RevfileError(Exception):

227

241

return self._add_compressed(text_sha, text, _NO_RECORD, compress)

228

242

229

243

244

# NOT USED

245

def _choose_base(self, seed, base):

246

while seed & 3 == 3:

247

if base == _NO_RECORD:

248

return _NO_RECORD

249

idxrec = self[base]

250

if idxrec[I_BASE] == _NO_RECORD:

251

return base

252

253

base = idxrec[I_BASE]

254

seed >>= 2

255

256

return base # relative to this full text

257

258

259

230

260

def _add_delta(self, text, text_sha, base, compress):

231

261

"""Add a text stored relative to a previous text."""

232

262

self._check_index(base)

233

263

234

264

try:

235

base_text = self.get(base, recursion_limit=CHAIN_LIMIT)

265

base_text = self.get(base, CHAIN_LIMIT)

236

266

except LimitHitException:

237

267

return self._add_full_text(text, text_sha, compress)

238

268

272

302

# it's the same, in case someone ever breaks SHA-1.

273

303

return idx # already present

274

304

305

# base = self._choose_base(ord(text_sha[0]), base)

306

275

307

if base == _NO_RECORD:

276

308

return self._add_full_text(text, text_sha, compress)

277

309

else:

372

404

self._seek_index(idx)

373

405

idxrec = self._read_next_index()

374

406

if idxrec == None:

375

raise IndexError()

407

raise IndexError("no index %d" % idx)

376

408

else:

377

409

return idxrec

378

410

388

420

"""Read back all index records.

389

421

390

422

Do not seek the index file while this is underway!"""

391

sys.stderr.write(" ** iter called ** \n")

423

## sys.stderr.write(" ** iter called ** \n")

392

424

self._seek_index(0)

393

425

while True:

394

426

idxrec = self._read_next_index()

442

474

def main(argv):

443

475

try:

444

476

cmd = argv[1]

477

filename = argv[2]

445

478

except IndexError:

446

sys.stderr.write("usage: revfile dump\n"

447

" revfile add\n"

448

" revfile add-delta BASE\n"

449

" revfile get IDX\n"

450

" revfile find-sha HEX\n"

451

" revfile total-text-size\n"

452

" revfile last\n")

479

sys.stderr.write("usage: revfile dump REVFILE\n"

480

" revfile add REVFILE < INPUT\n"

481

" revfile add-delta REVFILE BASE < INPUT\n"

482

" revfile add-series REVFILE BASE FILE...\n"

483

" revfile get REVFILE IDX\n"

484

" revfile find-sha REVFILE HEX\n"

485

" revfile total-text-size REVFILE\n"

486

" revfile last REVFILE\n")

453

487

return 1

454

488

455

489

def rw():

456

return Revfile('testrev', 'w')

490

return Revfile(filename, 'w')

457

491

458

492

def ro():

459

return Revfile('testrev', 'r')

493

return Revfile(filename, 'r')

460

494

461

495

if cmd == 'add':

462

496

print rw().add(sys.stdin.read())

463

497

elif cmd == 'add-delta':

464

print rw().add(sys.stdin.read(), int(argv[2]))

498

print rw().add(sys.stdin.read(), int(argv[3]))

499

elif cmd == 'add-series':

500

r = rw()

501

rev = int(argv[3])

502

for fn in argv[4:]:

503

print rev

504

rev = r.add(file(fn).read(), rev)

465

505

elif cmd == 'dump':

466

506

ro().dump()

467

507

elif cmd == 'get':

468

508

try:

469

idx = int(argv[2])

509

idx = int(argv[3])

470

510

except IndexError:

471

sys.stderr.write("usage: revfile get IDX\n")

511

sys.stderr.write("usage: revfile get FILE IDX\n")

472

512

return 1

473

513

514

r = ro()

515

474

516

if idx < 0 or idx >= len(r):

475

517

sys.stderr.write("invalid index %r\n" % idx)

476

518

return 1

477

519

478

sys.stdout.write(ro().get(idx))

520

sys.stdout.write(r.get(idx))

479

521

elif cmd == 'find-sha':

480

522

try:

481

s = unhexlify(argv[2])

523

s = unhexlify(argv[3])

482

524

except IndexError:

483

sys.stderr.write("usage: revfile find-sha HEX\n")

525

sys.stderr.write("usage: revfile find-sha FILE HEX\n")

484

526

return 1

485

527

486

528

idx = ro().find_sha(s)

Older »