3
# (C) 2005 Canonical Ltd
5
# based on an idea by Matt Mackall
6
# modified to squish into bzr by Martin Pool
8
# This program is free software; you can redistribute it and/or modify
9
# it under the terms of the GNU General Public License as published by
10
# the Free Software Foundation; either version 2 of the License, or
11
# (at your option) any later version.
13
# This program is distributed in the hope that it will be useful,
14
# but WITHOUT ANY WARRANTY; without even the implied warranty of
15
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
# GNU General Public License for more details.
18
# You should have received a copy of the GNU General Public License
19
# along with this program; if not, write to the Free Software
20
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23
"""Packed file revision storage.
25
A Revfile holds the text history of a particular source file, such
26
as Makefile. It can represent a tree of text versions for that
27
file, allowing for microbranches within a single repository.
29
This is stored on disk as two files: an index file, and a data file.
30
The index file is short and always read completely into memory; the
31
data file is much longer and only the relevant bits of it,
32
identified by the index file, need to be read.
34
Each text version is identified by the SHA-1 of the full text of
35
that version. It also has a sequence number within the file.
37
The index file has a short header and then a sequence of fixed-length
40
* byte[20] SHA-1 of text (as binary, not hex)
41
* uint32 sequence number this is based on, or -1 for full text
42
* uint32 flags: 1=zlib compressed
43
* uint32 offset in text file of start
44
* uint32 length of compressed delta in text file
49
The header is also 48 bytes for tidyness and easy calculation.
51
Both the index and the text are only ever appended to; a consequence
52
is that sequence numbers are stable references. But not every
53
repository in the world will assign the same sequence numbers,
54
therefore the SHA-1 is the only universally unique reference.
56
This is meant to scale to hold 100,000 revisions of a single file, by
57
which time the index file will be ~4.8MB and a bit big to read
60
Some of the reserved fields could be used to implement a (semi?)
61
balanced tree indexed by SHA1 so we can much more efficiently find the
62
index associated with a particular hash. For 100,000 revs we would be
63
able to find it in about 17 random reads, which is not too bad.
65
This performs pretty well except when trying to calculate deltas of
66
really large files. For that the main thing would be to plug in
67
something faster than difflib, which is after all pure Python.
68
Another approach is to just store the gzipped full text of big files,
69
though perhaps that's too perverse?
71
The iter method here will generally read through the whole index file
72
in one go. With readahead in the kernel and python/libc (typically
73
128kB) this means that there should be no seeks and often only one
74
read() call to get everything into memory.
78
# TODO: Something like pread() would make this slightly simpler and
79
# perhaps more efficient.
81
# TODO: Could also try to mmap things... Might be faster for the
82
# index in particular?
84
# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means
85
# rewriting existing records, which is not so nice.
87
# TODO: Something to check that regions identified in the index file
88
# completely butt up and do not overlap. Strictly it's not a problem
89
# if there are gaps and that can happen if we're interrupted while
90
# writing to the datafile. Overlapping would be very bad though.
94
import sys, zlib, struct, mdiff, stat, os, sha
95
from binascii import hexlify, unhexlify
99
_HEADER = "bzr revfile v1\n"
100
_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))
101
_NO_RECORD = 0xFFFFFFFFL
103
# fields in the index record
112
# maximum number of patches in a row before recording a whole text.
116
class RevfileError(Exception):
119
class LimitHitException(Exception):
122
class Revfile(object):
123
def __init__(self, basename, mode):
124
# TODO: Lock file while open
126
# TODO: advise of random access
128
self.basename = basename
130
if mode not in ['r', 'w']:
131
raise RevfileError("invalid open mode %r" % mode)
134
idxname = basename + '.irev'
135
dataname = basename + '.drev'
137
idx_exists = os.path.exists(idxname)
138
data_exists = os.path.exists(dataname)
140
if idx_exists != data_exists:
141
raise RevfileError("half-assed revfile")
145
raise RevfileError("Revfile %r does not exist" % basename)
147
self.idxfile = open(idxname, 'w+b')
148
self.datafile = open(dataname, 'w+b')
150
print 'init empty file'
151
self.idxfile.write(_HEADER)
159
self.idxfile = open(idxname, diskmode)
160
self.datafile = open(dataname, diskmode)
162
h = self.idxfile.read(_RECORDSIZE)
164
raise RevfileError("bad header %r in index of %r"
165
% (h, self.basename))
168
def _check_index(self, idx):
169
if idx < 0 or idx > len(self):
170
raise RevfileError("invalid index %r" % idx)
172
def _check_write(self):
174
raise RevfileError("%r is open readonly" % self.basename)
177
def find_sha(self, s):
178
assert isinstance(s, str)
181
for idx, idxrec in enumerate(self):
182
if idxrec[I_SHA] == s:
189
def _add_compressed(self, text_sha, data, base, compress):
190
# well, maybe compress
195
# don't do compression if it's too small; it's unlikely to win
196
# enough to be worthwhile
197
compr_data = zlib.compress(data)
198
compr_len = len(compr_data)
199
if compr_len < data_len:
202
##print '- compressed %d -> %d, %.1f%%' \
203
## % (data_len, compr_len, float(compr_len)/float(data_len) * 100.0)
204
return self._add_raw(text_sha, data, base, flags)
208
def _add_raw(self, text_sha, data, base, flags):
209
"""Add pre-processed data, can be either full text or delta.
211
This does the compression if that makes sense."""
213
self.datafile.seek(0, 2) # to end
214
self.idxfile.seek(0, 2)
215
assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
216
data_offset = self.datafile.tell()
218
assert isinstance(data, str) # not unicode or anything weird
220
self.datafile.write(data)
221
self.datafile.flush()
223
assert isinstance(text_sha, str)
225
entry += struct.pack(">IIII12x", base, flags, data_offset, len(data))
226
assert len(entry) == _RECORDSIZE
228
self.idxfile.write(entry)
235
def _add_full_text(self, text, text_sha, compress):
236
"""Add a full text to the file.
238
This is not compressed against any reference version.
240
Returns the index for that text."""
241
return self._add_compressed(text_sha, text, _NO_RECORD, compress)
245
def _choose_base(self, seed, base):
247
if base == _NO_RECORD:
250
if idxrec[I_BASE] == _NO_RECORD:
253
base = idxrec[I_BASE]
256
return base # relative to this full text
260
def _add_delta(self, text, text_sha, base, compress):
261
"""Add a text stored relative to a previous text."""
262
self._check_index(base)
265
base_text = self.get(base, CHAIN_LIMIT)
266
except LimitHitException:
267
return self._add_full_text(text, text_sha, compress)
269
data = mdiff.bdiff(base_text, text)
271
# If the delta is larger than the text, we might as well just
272
# store the text. (OK, the delta might be more compressible,
273
# but the overhead of applying it probably still makes it
274
# bad, and I don't want to compress both of them to find out.)
275
if len(data) >= len(text):
276
return self._add_full_text(text, text_sha, compress)
278
return self._add_compressed(text_sha, data, base, compress)
281
def add(self, text, base=_NO_RECORD, compress=True):
282
"""Add a new text to the revfile.
284
If the text is already present them its existing id is
285
returned and the file is not changed.
287
If compress is true then gzip compression will be used if it
290
If a base index is specified, that text *may* be used for
291
delta compression of the new text. Delta compression will
292
only be used if it would be a size win and if the existing
293
base is not at too long of a delta chain already.
297
text_sha = sha.new(text).digest()
299
idx = self.find_sha(text_sha)
300
if idx != _NO_RECORD:
301
# TODO: Optional paranoid mode where we read out that record and make sure
302
# it's the same, in case someone ever breaks SHA-1.
303
return idx # already present
305
# base = self._choose_base(ord(text_sha[0]), base)
307
if base == _NO_RECORD:
308
return self._add_full_text(text, text_sha, compress)
310
return self._add_delta(text, text_sha, base, compress)
314
def get(self, idx, recursion_limit=None):
315
"""Retrieve text of a previous revision.
317
If recursion_limit is an integer then walk back at most that
318
many revisions and then raise LimitHitException, indicating
319
that we ought to record a new file text instead of another
320
delta. Don't use this when trying to get out an existing
324
base = idxrec[I_BASE]
325
if base == _NO_RECORD:
326
text = self._get_full_text(idx, idxrec)
328
text = self._get_patched(idx, idxrec, recursion_limit)
330
if sha.new(text).digest() != idxrec[I_SHA]:
331
raise RevfileError("corrupt SHA-1 digest on record %d"
338
def _get_raw(self, idx, idxrec):
339
flags = idxrec[I_FLAGS]
341
raise RevfileError("unsupported index flags %#x on index %d"
348
self.datafile.seek(idxrec[I_OFFSET])
350
data = self.datafile.read(l)
352
raise RevfileError("short read %d of %d "
353
"getting text for record %d in %r"
354
% (len(data), l, idx, self.basename))
357
data = zlib.decompress(data)
362
def _get_full_text(self, idx, idxrec):
363
assert idxrec[I_BASE] == _NO_RECORD
365
text = self._get_raw(idx, idxrec)
370
def _get_patched(self, idx, idxrec, recursion_limit):
371
base = idxrec[I_BASE]
373
assert base < idx # no loops!
375
if recursion_limit == None:
378
sub_limit = recursion_limit - 1
380
raise LimitHitException()
382
base_text = self.get(base, sub_limit)
383
patch = self._get_raw(idx, idxrec)
385
text = mdiff.bpatch(base_text, patch)
392
"""Return number of revisions."""
393
l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
395
raise RevfileError("bad length %d on index of %r" % (l, self.basename))
397
raise RevfileError("no header present in index of %r" % (self.basename))
398
return int(l / _RECORDSIZE) - 1
401
def __getitem__(self, idx):
402
"""Index by sequence id returns the index field"""
403
## TODO: Can avoid seek if we just moved there...
404
self._seek_index(idx)
405
idxrec = self._read_next_index()
407
raise IndexError("no index %d" % idx)
412
def _seek_index(self, idx):
414
raise RevfileError("invalid index %r" % idx)
415
self.idxfile.seek((idx + 1) * _RECORDSIZE)
420
"""Read back all index records.
422
Do not seek the index file while this is underway!"""
423
## sys.stderr.write(" ** iter called ** \n")
426
idxrec = self._read_next_index()
432
def _read_next_index(self):
433
rec = self.idxfile.read(_RECORDSIZE)
436
elif len(rec) != _RECORDSIZE:
437
raise RevfileError("short read of %d bytes getting index %d from %r"
438
% (len(rec), idx, self.basename))
440
return struct.unpack(">20sIIII12x", rec)
443
def dump(self, f=sys.stdout):
444
f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n'
445
% tuple('idx sha1 base flags offset len'.split()))
446
f.write('-------- ---------------------------------------- ')
447
f.write('-------- -------- -------- --------\n')
449
for i, rec in enumerate(self):
450
f.write("#%-7d %40s " % (i, hexlify(rec[0])))
451
if rec[1] == _NO_RECORD:
454
f.write("#%-7d " % rec[1])
456
f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))
459
def total_text_size(self):
460
"""Return the sum of sizes of all file texts.
462
This is how much space they would occupy if they were stored without
463
delta and gzip compression.
465
As a side effect this completely validates the Revfile, checking that all
466
texts can be reproduced with the correct SHA-1."""
468
for idx in range(len(self)):
469
t += len(self.get(idx))
479
sys.stderr.write("usage: revfile dump REVFILE\n"
480
" revfile add REVFILE < INPUT\n"
481
" revfile add-delta REVFILE BASE < INPUT\n"
482
" revfile add-series REVFILE BASE FILE...\n"
483
" revfile get REVFILE IDX\n"
484
" revfile find-sha REVFILE HEX\n"
485
" revfile total-text-size REVFILE\n"
486
" revfile last REVFILE\n")
490
return Revfile(filename, 'w')
493
return Revfile(filename, 'r')
496
print rw().add(sys.stdin.read())
497
elif cmd == 'add-delta':
498
print rw().add(sys.stdin.read(), int(argv[3]))
499
elif cmd == 'add-series':
504
rev = r.add(file(fn).read(), rev)
511
sys.stderr.write("usage: revfile get FILE IDX\n")
516
if idx < 0 or idx >= len(r):
517
sys.stderr.write("invalid index %r\n" % idx)
520
sys.stdout.write(r.get(idx))
521
elif cmd == 'find-sha':
523
s = unhexlify(argv[3])
525
sys.stderr.write("usage: revfile find-sha FILE HEX\n")
528
idx = ro().find_sha(s)
529
if idx == _NO_RECORD:
530
sys.stderr.write("no such record\n")
534
elif cmd == 'total-text-size':
535
print ro().total_text_size()
539
sys.stderr.write("unknown command %r\n" % cmd)
543
if __name__ == '__main__':
545
sys.exit(main(sys.argv) or 0)