~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: mbp at sourcefrog
Date: 2005-04-09 06:21:44 UTC
Revision ID: mbp@sourcefrog.net-20050409062144-e47a4b64106e4c21af99beaf

debug output

files added:
.rsyncexclude

bzrlib/mdiff.py

bzrlib/newinventory.py

bzrlib/remotebranch.py

bzrlib/revfile.py

doc/quotes.txt

notes/new-inventory-sample.xml

test.sh

urlgrabber

urlgrabber/__init__.py

urlgrabber/byterange.py

urlgrabber/grabber.py

urlgrabber/keepalive.py

urlgrabber/mirror.py

urlgrabber/progress.py

files modified:
.bzrignore

NEWS

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/osutils.py

bzrlib/revision.py

bzrlib/store.py

bzrlib/tests.py

bzrlib/trace.py

bzrlib/tree.py

doc/random.txt

doc/svk.txt

notes/performance.txt

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

#! /usr/bin/env python

# based on an idea by Matt Mackall

# modified to squish into bzr by Martin Pool

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Packed file revision storage.

A Revfile holds the text history of a particular source file, such

as Makefile. It can represent a tree of text versions for that

file, allowing for microbranches within a single repository.

This is stored on disk as two files: an index file, and a data file.

The index file is short and always read completely into memory; the

data file is much longer and only the relevant bits of it,

identified by the index file, need to be read.

Each text version is identified by the SHA-1 of the full text of

that version. It also has a sequence number within the file.

The index file has a short header and then a sequence of fixed-length

records:

* byte[20] SHA-1 of text (as binary, not hex)

* uint32 sequence number this is based on, or -1 for full text

* uint32 flags: 1=zlib compressed

* uint32 offset in text file of start

* uint32 length of compressed delta in text file

* uint32[3] reserved

total 48 bytes.

The header is also 48 bytes for tidyness and easy calculation.

Both the index and the text are only ever appended to; a consequence

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

factor = 10

_RECORDSIZE = 48

_HEADER = "bzr revfile v1\n"

_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))

_NO_RECORD = 0xFFFFFFFFL

100

# fields in the index record

101

I_SHA = 0

102

I_BASE = 1

103

I_FLAGS = 2

104

I_OFFSET = 3

105

I_LEN = 4

106

107

FL_GZIP = 1

108

109

# maximum number of patches in a row before recording a whole text.

110

# intentionally pretty low for testing purposes.

111

CHAIN_LIMIT = 2

112

113

114

class RevfileError(Exception):

115

pass

116

117

class LimitHitException(Exception):

118

pass

119

120

class Revfile:

121

def __init__(self, basename):

122

# TODO: Option to open readonly

123

124

# TODO: Lock file while open

125

126

# TODO: advise of random access

127

128

self.basename = basename

129

130

idxname = basename + '.irev'

131

dataname = basename + '.drev'

132

133

idx_exists = os.path.exists(idxname)

134

data_exists = os.path.exists(dataname)

135

136

if idx_exists != data_exists:

137

raise RevfileError("half-assed revfile")

138

139

if not idx_exists:

140

self.idxfile = open(idxname, 'w+b')

141

self.datafile = open(dataname, 'w+b')

142

143

print 'init empty file'

144

self.idxfile.write(_HEADER)

145

self.idxfile.flush()

146

else:

147

self.idxfile = open(idxname, 'r+b')

148

self.datafile = open(dataname, 'r+b')

149

150

h = self.idxfile.read(_RECORDSIZE)

151

if h != _HEADER:

152

raise RevfileError("bad header %r in index of %r"

153

% (h, self.basename))

154

155

156

def _check_index(self, idx):

157

if idx < 0 or idx > len(self):

158

raise RevfileError("invalid index %r" % idx)

159

160

161

def find_sha(self, s):

162

assert isinstance(s, str)

163

assert len(s) == 20

164

165

for idx, idxrec in enumerate(self):

166

if idxrec[I_SHA] == s:

167

return idx

168

else:

169

return _NO_RECORD

170

171

172

173

def _add_compressed(self, text_sha, data, base, compress):

174

# well, maybe compress

175

flags = 0

176

if compress:

177

data_len = len(data)

178

if data_len > 50:

179

# don't do compression if it's too small; it's unlikely to win

180

# enough to be worthwhile

181

compr_data = zlib.compress(data)

182

compr_len = len(compr_data)

183

if compr_len < data_len:

184

data = compr_data

185

flags = FL_GZIP

186

##print '- compressed %d -> %d, %.1f%%' \

187

## % (data_len, compr_len, float(compr_len)/float(data_len) * 100.0)

188

return self._add_raw(text_sha, data, base, flags)

189

190

191

192

def _add_raw(self, text_sha, data, base, flags):

193

"""Add pre-processed data, can be either full text or delta.

194

195

This does the compression if that makes sense."""

196

idx = len(self)

197

self.datafile.seek(0, 2) # to end

198

self.idxfile.seek(0, 2)

199

assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)

200

data_offset = self.datafile.tell()

201

202

assert isinstance(data, str) # not unicode or anything wierd

203

204

self.datafile.write(data)

205

self.datafile.flush()

206

207

assert isinstance(text_sha, str)

208

entry = text_sha

209

entry += struct.pack(">IIII12x", base, flags, data_offset, len(data))

210

assert len(entry) == _RECORDSIZE

211

212

self.idxfile.write(entry)

213

self.idxfile.flush()

214

215

return idx

216

217

218

219

def _add_full_text(self, text, text_sha, compress):

220

"""Add a full text to the file.

221

222

This is not compressed against any reference version.

223

224

Returns the index for that text."""

225

return self._add_compressed(text_sha, text, _NO_RECORD, compress)

226

227

228

def _add_delta(self, text, text_sha, base, compress):

229

"""Add a text stored relative to a previous text."""

230

self._check_index(base)

231

232

try:

233

base_text = self.get(base, recursion_limit=CHAIN_LIMIT)

234

except LimitHitException:

235

return self._add_full_text(text, text_sha, compress)

236

237

data = mdiff.bdiff(base_text, text)

238

239

# If the delta is larger than the text, we might as well just

240

# store the text. (OK, the delta might be more compressible,

241

# but the overhead of applying it probably still makes it

242

# bad, and I don't want to compress both of them to find out.)

243

if len(data) >= len(text):

244

return self._add_full_text(text, text_sha, compress)

245

else:

246

return self._add_compressed(text_sha, data, base, compress)

247

248

249

def add(self, text, base=_NO_RECORD, compress=True):

250

"""Add a new text to the revfile.

251

252

If the text is already present them its existing id is

253

returned and the file is not changed.

254

255

If compress is true then gzip compression will be used if it

256

reduces the size.

257

258

If a base index is specified, that text *may* be used for

259

delta compression of the new text. Delta compression will

260

only be used if it would be a size win and if the existing

261

base is not at too long of a delta chain already.

262

"""

263

text_sha = sha.new(text).digest()

264

265

idx = self.find_sha(text_sha)

266

if idx != _NO_RECORD:

267

# TODO: Optional paranoid mode where we read out that record and make sure

268

# it's the same, in case someone ever breaks SHA-1.

269

return idx # already present

270

271

if base == _NO_RECORD:

272

return self._add_full_text(text, text_sha, compress)

273

else:

274

return self._add_delta(text, text_sha, base, compress)

275

276

277

278

def get(self, idx, recursion_limit=None):

279

"""Retrieve text of a previous revision.

280

281

If recursion_limit is an integer then walk back at most that

282

many revisions and then raise LimitHitException, indicating

283

that we ought to record a new file text instead of another

284

delta. Don't use this when trying to get out an existing

285

revision."""

286

287

idxrec = self[idx]

288

base = idxrec[I_BASE]

289

if base == _NO_RECORD:

290

text = self._get_full_text(idx, idxrec)

291

else:

292

text = self._get_patched(idx, idxrec, recursion_limit)

293

294

if sha.new(text).digest() != idxrec[I_SHA]:

295

raise RevfileError("corrupt SHA-1 digest on record %d"

296

% idx)

297

298

return text

299

300

301

302

def _get_raw(self, idx, idxrec):

303

flags = idxrec[I_FLAGS]

304

if flags & ~FL_GZIP:

305

raise RevfileError("unsupported index flags %#x on index %d"

306

% (flags, idx))

307

308

l = idxrec[I_LEN]

309

if l == 0:

310

return ''

311

312

self.datafile.seek(idxrec[I_OFFSET])

313

314

data = self.datafile.read(l)

315

if len(data) != l:

316

raise RevfileError("short read %d of %d "

317

"getting text for record %d in %r"

318

% (len(data), l, idx, self.basename))

319

320

if flags & FL_GZIP:

321

data = zlib.decompress(data)

322

323

return data

324

325

326

def _get_full_text(self, idx, idxrec):

327

assert idxrec[I_BASE] == _NO_RECORD

328

329

text = self._get_raw(idx, idxrec)

330

331

return text

332

333

334

def _get_patched(self, idx, idxrec, recursion_limit):

335

base = idxrec[I_BASE]

336

assert base >= 0

337

assert base < idx # no loops!

338

339

if recursion_limit == None:

340

sub_limit = None

341

else:

342

sub_limit = recursion_limit - 1

343

if sub_limit < 0:

344

raise LimitHitException()

345

346

base_text = self.get(base, sub_limit)

347

patch = self._get_raw(idx, idxrec)

348

349

text = mdiff.bpatch(base_text, patch)

350

351

return text

352

353

354

355

def __len__(self):

356

"""Return number of revisions."""

357

l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]

358

if l % _RECORDSIZE:

359

raise RevfileError("bad length %d on index of %r" % (l, self.basename))

360

if l < _RECORDSIZE:

361

raise RevfileError("no header present in index of %r" % (self.basename))

362

return int(l / _RECORDSIZE) - 1

363

364

365

def __getitem__(self, idx):

366

"""Index by sequence id returns the index field"""

367

## TODO: Can avoid seek if we just moved there...

368

self._seek_index(idx)

369

return self._read_next_index()

370

371

372

def _seek_index(self, idx):

373

if idx < 0:

374

raise RevfileError("invalid index %r" % idx)

375

self.idxfile.seek((idx + 1) * _RECORDSIZE)

376

377

378

def _read_next_index(self):

379

rec = self.idxfile.read(_RECORDSIZE)

380

if not rec:

381

raise IndexError("end of index file")

382

elif len(rec) != _RECORDSIZE:

383

raise RevfileError("short read of %d bytes getting index %d from %r"

384

% (len(rec), idx, self.basename))

385

386

return struct.unpack(">20sIIII12x", rec)

387

388

389

def dump(self, f=sys.stdout):

390

f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n'

391

% tuple('idx sha1 base flags offset len'.split()))

392

f.write('-------- ---------------------------------------- ')

393

f.write('-------- -------- -------- --------\n')

394

395

for i, rec in enumerate(self):

396

f.write("#%-7d %40s " % (i, hexlify(rec[0])))

397

if rec[1] == _NO_RECORD:

398

f.write("(none) ")

399

else:

400

f.write("#%-7d " % rec[1])

401

402

f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))

403

404

405

def total_text_size(self):

406

"""Return the sum of sizes of all file texts.

407

408

This is how much space they would occupy if they were stored without

409

delta and gzip compression.

410

411

As a side effect this completely validates the Revfile, checking that all

412

texts can be reproduced with the correct SHA-1."""

413

t = 0L

414

for idx in range(len(self)):

415

t += len(self.get(idx))

416

return t

417

418

419

420

def main(argv):

421

r = Revfile("testrev")

422

423

try:

424

cmd = argv[1]

425

except IndexError:

426

sys.stderr.write("usage: revfile dump\n"

427

" revfile add\n"

428

" revfile add-delta BASE\n"

429

" revfile get IDX\n"

430

" revfile find-sha HEX\n"

431

" revfile total-text-size\n")

432

return 1

433

434

if cmd == 'add':

435

new_idx = r.add(sys.stdin.read())

436

print new_idx

437

elif cmd == 'add-delta':

438

new_idx = r.add(sys.stdin.read(), int(argv[2]))

439

print new_idx

440

elif cmd == 'dump':

441

r.dump()

442

elif cmd == 'get':

443

try:

444

idx = int(argv[2])

445

except IndexError:

446

sys.stderr.write("usage: revfile get IDX\n")

447

return 1

448

449

if idx < 0 or idx >= len(r):

450

sys.stderr.write("invalid index %r\n" % idx)

451

return 1

452

453

sys.stdout.write(r.get(idx))

454

elif cmd == 'find-sha':

455

try:

456

s = unhexlify(argv[2])

457

except IndexError:

458

sys.stderr.write("usage: revfile find-sha HEX\n")

459

return 1

460

461

idx = r.find_sha(s)

462

if idx == _NO_RECORD:

463

sys.stderr.write("no such record\n")

464

return 1

465

else:

466

print idx

467

elif cmd == 'total-text-size':

468

print r.total_text_size()

469

else:

470

sys.stderr.write("unknown command %r\n" % cmd)

471

return 1

472

473

474

if __name__ == '__main__':

475

import sys

476

sys.exit(main(sys.argv) or 0)

Older »