61
61
balanced tree indexed by SHA1 so we can much more efficiently find the
62
62
index associated with a particular hash. For 100,000 revs we would be
63
63
able to find it in about 17 random reads, which is not too bad.
65
This performs pretty well except when trying to calculate deltas of
66
really large files. For that the main thing would be to plug in
67
something faster than difflib, which is after all pure Python.
68
Another approach is to just store the gzipped full text of big files,
69
though perhaps that's too perverse?
71
The iter method here will generally read through the whole index file
72
in one go. With readahead in the kernel and python/libc (typically
73
128kB) this means that there should be no seeks and often only one
74
read() call to get everything into memory.
78
67
# TODO: Something like pread() would make this slightly simpler and
79
68
# perhaps more efficient.
81
# TODO: Could also try to mmap things... Might be faster for the
82
# index in particular?
84
# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means
85
# rewriting existing records, which is not so nice.
87
# TODO: Something to check that regions identified in the index file
88
# completely butt up and do not overlap. Strictly it's not a problem
89
# if there are gaps and that can happen if we're interrupted while
90
# writing to the datafile. Overlapping would be very bad though.
92
# TODO: Shouldn't need to lock if we always write in append mode and
93
# then ftell after writing to see where it went. In any case we
94
# assume the whole branch is protected by a lock.
70
# TODO: Could also try to mmap things...
73
import sys, zlib, struct, mdiff, stat, os, sha
102
74
from binascii import hexlify, unhexlify
104
import bzrlib.mdiff as mdiff
122
# maximum number of patches in a row before recording a whole text.
126
94
class RevfileError(Exception):
129
class LimitHitException(Exception):
132
class Revfile(object):
133
def __init__(self, basename, mode):
100
def __init__(self, basename):
101
# TODO: Option to open readonly
134
103
# TODO: Lock file while open
136
105
# TODO: advise of random access
138
107
self.basename = basename
140
if mode not in ['r', 'w']:
141
raise RevfileError("invalid open mode %r" % mode)
144
109
idxname = basename + '.irev'
145
110
dataname = basename + '.drev'
151
116
raise RevfileError("half-assed revfile")
153
118
if not idx_exists:
155
raise RevfileError("Revfile %r does not exist" % basename)
157
119
self.idxfile = open(idxname, 'w+b')
158
120
self.datafile = open(dataname, 'w+b')
122
print 'init empty file'
160
123
self.idxfile.write(_HEADER)
161
124
self.idxfile.flush()
168
self.idxfile = open(idxname, diskmode)
169
self.datafile = open(dataname, diskmode)
126
self.idxfile = open(idxname, 'r+b')
127
self.datafile = open(dataname, 'r+b')
171
129
h = self.idxfile.read(_RECORDSIZE)
224
178
assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
225
179
data_offset = self.datafile.tell()
227
assert isinstance(data, str) # not unicode or anything weird
181
assert isinstance(data, str) # not unicode or anything wierd
229
183
self.datafile.write(data)
230
184
self.datafile.flush()
244
def _add_full_text(self, text, text_sha, compress):
198
def _add_full_text(self, text, text_sha):
245
199
"""Add a full text to the file.
247
201
This is not compressed against any reference version.
250
204
return self._add_compressed(text_sha, text, _NO_RECORD, compress)
254
def _choose_base(self, seed, base):
256
if base == _NO_RECORD:
259
if idxrec[I_BASE] == _NO_RECORD:
262
base = idxrec[I_BASE]
265
return base # relative to this full text
269
207
def _add_delta(self, text, text_sha, base, compress):
270
208
"""Add a text stored relative to a previous text."""
271
209
self._check_index(base)
274
base_text = self.get(base, CHAIN_LIMIT)
275
except LimitHitException:
276
return self._add_full_text(text, text_sha, compress)
210
base_text = self.get(base)
278
211
data = mdiff.bdiff(base_text, text)
281
if True: # paranoid early check for bad diff
282
result = mdiff.bpatch(base_text, data)
283
assert result == text
286
213
# If the delta is larger than the text, we might as well just
287
214
# store the text. (OK, the delta might be more compressible,
293
220
return self._add_compressed(text_sha, data, base, compress)
296
def add(self, text, base=None, compress=True):
223
def add(self, text, base=_NO_RECORD, compress=True):
297
224
"""Add a new text to the revfile.
299
226
If the text is already present them its existing id is
320
242
# it's the same, in case someone ever breaks SHA-1.
321
243
return idx # already present
323
# base = self._choose_base(ord(text_sha[0]), base)
325
245
if base == _NO_RECORD:
326
246
return self._add_full_text(text, text_sha, compress)
332
def get(self, idx, recursion_limit=None):
333
"""Retrieve text of a previous revision.
335
If recursion_limit is an integer then walk back at most that
336
many revisions and then raise LimitHitException, indicating
337
that we ought to record a new file text instead of another
338
delta. Don't use this when trying to get out an existing
341
253
idxrec = self[idx]
342
254
base = idxrec[I_BASE]
343
255
if base == _NO_RECORD:
344
256
text = self._get_full_text(idx, idxrec)
346
text = self._get_patched(idx, idxrec, recursion_limit)
258
text = self._get_patched(idx, idxrec)
348
260
if sha.new(text).digest() != idxrec[I_SHA]:
349
raise RevfileError("corrupt SHA-1 digest on record %d in %s"
350
% (idx, self.basename))
261
raise RevfileError("corrupt SHA-1 digest on record %d"
388
def _get_patched(self, idx, idxrec, recursion_limit):
300
def _get_patched(self, idx, idxrec):
389
301
base = idxrec[I_BASE]
391
303
assert base < idx # no loops!
393
if recursion_limit == None:
396
sub_limit = recursion_limit - 1
398
raise LimitHitException()
400
base_text = self.get(base, sub_limit)
305
base_text = self.get(base)
401
306
patch = self._get_raw(idx, idxrec)
403
308
text = mdiff.bpatch(base_text, patch)
420
325
"""Index by sequence id returns the index field"""
421
326
## TODO: Can avoid seek if we just moved there...
422
327
self._seek_index(idx)
423
idxrec = self._read_next_index()
425
raise IndexError("no index %d" % idx)
328
return self._read_next_index()
430
331
def _seek_index(self, idx):
432
333
raise RevfileError("invalid index %r" % idx)
433
334
self.idxfile.seek((idx + 1) * _RECORDSIZE)
438
"""Read back all index records.
440
Do not seek the index file while this is underway!"""
441
## sys.stderr.write(" ** iter called ** \n")
444
idxrec = self._read_next_index()
450
337
def _read_next_index(self):
451
338
rec = self.idxfile.read(_RECORDSIZE)
340
raise IndexError("end of index file")
454
341
elif len(rec) != _RECORDSIZE:
455
342
raise RevfileError("short read of %d bytes getting index %d from %r"
456
343
% (len(rec), idx, self.basename))
472
359
f.write("#%-7d " % rec[1])
474
361
f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))
477
def total_text_size(self):
478
"""Return the sum of sizes of all file texts.
480
This is how much space they would occupy if they were stored without
481
delta and gzip compression.
483
As a side effect this completely validates the Revfile, checking that all
484
texts can be reproduced with the correct SHA-1."""
486
for idx in range(len(self)):
487
t += len(self.get(idx))
491
def check(self, pb=None):
492
"""Extract every version and check its hash."""
494
for i in range(total):
496
pb.update("check revision", i, total)
497
# the get method implicitly checks the SHA-1
366
r = Revfile("testrev")
508
370
except IndexError:
509
sys.stderr.write("usage: revfile dump REVFILE\n"
510
" revfile add REVFILE < INPUT\n"
511
" revfile add-delta REVFILE BASE < INPUT\n"
512
" revfile add-series REVFILE BASE FILE...\n"
513
" revfile get REVFILE IDX\n"
514
" revfile find-sha REVFILE HEX\n"
515
" revfile total-text-size REVFILE\n"
516
" revfile last REVFILE\n")
371
sys.stderr.write("usage: revfile dump\n"
373
" revfile add-delta BASE\n"
375
" revfile find-sha HEX\n")
519
if filename.endswith('.drev') or filename.endswith('.irev'):
520
filename = filename[:-5]
523
return Revfile(filename, 'w')
526
return Revfile(filename, 'r')
529
print rw().add(sys.stdin.read())
380
new_idx = r.add(sys.stdin.read())
530
382
elif cmd == 'add-delta':
531
print rw().add(sys.stdin.read(), int(argv[3]))
532
elif cmd == 'add-series':
537
rev = r.add(file(fn).read(), rev)
383
new_idx = r.add(sys.stdin.read(), int(argv[2]))
538
385
elif cmd == 'dump':
540
387
elif cmd == 'get':
543
390
except IndexError:
544
sys.stderr.write("usage: revfile get FILE IDX\n")
391
sys.stderr.write("usage: revfile get IDX\n")
549
394
if idx < 0 or idx >= len(r):
550
395
sys.stderr.write("invalid index %r\n" % idx)
553
398
sys.stdout.write(r.get(idx))
554
399
elif cmd == 'find-sha':
556
s = unhexlify(argv[3])
401
s = unhexlify(argv[2])
557
402
except IndexError:
558
sys.stderr.write("usage: revfile find-sha FILE HEX\n")
403
sys.stderr.write("usage: revfile find-sha HEX\n")
561
idx = ro().find_sha(s)
562
407
if idx == _NO_RECORD:
563
408
sys.stderr.write("no such record\n")
567
elif cmd == 'total-text-size':
568
print ro().total_text_size()
572
import bzrlib.progress
573
pb = bzrlib.progress.ProgressBar()
576
414
sys.stderr.write("unknown command %r\n" % cmd)