balanced tree indexed by SHA1 so we can much more efficiently find the
index associated with a particular hash. For 100,000 revs we would be
able to find it in about 17 random reads, which is not too bad.
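
As a rough illustration (hypothetical sketch, not part of the current
format): bisecting a sorted side-table of fixed-size (sha1, index)
entries gives the same ~log2(n) bound, about 17 probes for 100,000
revisions:

    from bisect import bisect_left

    def find_by_sha(sorted_shas, sorted_idxs, want_sha):
        # sorted_shas holds the 20-byte digests in ascending order;
        # sorted_idxs[i] is the revfile index of sorted_shas[i].
        i = bisect_left(sorted_shas, want_sha)
        if i < len(sorted_shas) and sorted_shas[i] == want_sha:
            return sorted_idxs[i]
        return None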

This performs pretty well except when trying to calculate deltas of
really large files. For that the main thing would be to plug in
something faster than difflib, which is after all pure Python.
Another approach is to just store the gzipped full text of big files,
though perhaps that's too perverse?

The iter method here will generally read through the whole index file
in one go. With readahead in the kernel and python/libc (typically
128kB) this means that there should be no seeks and often only one
read() call to get everything into memory.

# TODO: Something like pread() would make this slightly simpler and
# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the
# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that it
# probably means rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file
# completely butt up and do not overlap. Strictly it's not a problem
# if there are gaps, and that can happen if we're interrupted while
# writing to the datafile. Overlapping would be very bad though.
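#
# A rough sketch of that check, for illustration only: it assumes the
# index records expose offset and length fields, here called I_OFFSET
# and I_LEN (hypothetical names), and that iterating the Revfile yields
# index records in the order they were written.
#
#     def _check_regions(self):
#         expect = 0
#         for idxrec in self:
#             if idxrec[I_OFFSET] < expect:
#                 raise RevfileError("overlapping regions in datafile")
#             # a gap (idxrec[I_OFFSET] > expect) is tolerated, as noted above
#             expect = idxrec[I_OFFSET] + idxrec[I_LEN]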

import sys, zlib, struct, mdiff, stat, os, sha
from binascii import hexlify, unhexlify


_HEADER = "bzr revfile v1\n"


        % (h, self.basename))

def _check_index(self, idx):
    if idx < 0 or idx > len(self):
        raise RevfileError("invalid index %r" % idx)

def _check_write(self):

        raise RevfileError("%r is open readonly" % self.basename)

def find_sha(self, s):
    assert isinstance(s, str)

        if idxrec[I_SHA] == s:

def _add_compressed(self, text_sha, data, base, compress):
    # well, maybe compress

    # don't do compression if it's too small; it's unlikely to win
    # enough to be worthwhile
    compr_data = zlib.compress(data)
    compr_len = len(compr_data)
    if compr_len < data_len:

    ##print '- compressed %d -> %d, %.1f%%' \
    ##       % (data_len, compr_len, float(compr_len)/float(data_len) * 100.0)

    return self._add_raw(text_sha, data, base, flags)

def _add_raw(self, text_sha, data, base, flags):
    """Add pre-processed data, can be either full text or delta.

    This does the compression if that makes sense."""

    self.datafile.seek(0, 2)        # to end
    self.idxfile.seek(0, 2)
    assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
    data_offset = self.datafile.tell()

    assert isinstance(data, str)    # not unicode or anything weird

    self.datafile.write(data)
    self.datafile.flush()

def _add_full_text(self, text, text_sha, compress):
    """Add a full text to the file.

    This is not compressed against any reference version.

    Returns the index for that text."""
    return self._add_compressed(text_sha, text, _NO_RECORD, compress)

def _choose_base(self, seed, base):
    if base == _NO_RECORD:

    if idxrec[I_BASE] == _NO_RECORD:

    base = idxrec[I_BASE]

    return base                     # relative to this full text

def _add_delta(self, text, text_sha, base, compress):
    """Add a text stored relative to a previous text."""
    self._check_index(base)

    try:
        base_text = self.get(base, CHAIN_LIMIT)
    except LimitHitException:
        return self._add_full_text(text, text_sha, compress)

    data = mdiff.bdiff(base_text, text)

    # If the delta is larger than the text, we might as well just
    # store the text. (OK, the delta might be more compressible,
    # but the overhead of applying it probably still makes it
    # bad, and I don't want to compress both of them to find out.)
    if len(data) >= len(text):
        return self._add_full_text(text, text_sha, compress)

    return self._add_compressed(text_sha, data, base, compress)

def add(self, text, base=_NO_RECORD, compress=True):
    """Add a new text to the revfile.

    If the text is already present then its existing id is
    returned and the file is not changed.

    If compress is true then gzip compression will be used if it
    makes the stored data smaller.

    If a base index is specified, that text *may* be used for
    delta compression of the new text. Delta compression will
    only be used if it would be a size win and if the existing
    base is not already part of too long a delta chain.
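
    A rough usage sketch (illustrative only; the file name is made up,
    and the mode strings follow the Revfile(name, mode) calls used in
    the command-line code below):

        r = Revfile("example", "w")
        idx = r.add("hello world\n")                # stored as a full text
        idx2 = r.add("hello there, world\n", idx)   # may be stored as a delta
        assert r.get(idx2) == "hello there, world\n"
    """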
    text_sha = sha.new(text).digest()

    idx = self.find_sha(text_sha)
    if idx != _NO_RECORD:
        # TODO: Optional paranoid mode where we read out that record and make sure
        # it's the same, in case someone ever breaks SHA-1.
        return idx                  # already present

    # base = self._choose_base(ord(text_sha[0]), base)

    if base == _NO_RECORD:
        return self._add_full_text(text, text_sha, compress)

    return self._add_delta(text, text_sha, base, compress)

def get(self, idx, recursion_limit=None):
    """Retrieve text of a previous revision.

    If recursion_limit is an integer then walk back at most that
    many revisions and then raise LimitHitException, indicating
    that we ought to record a new file text instead of another
    delta. Don't use this when trying to get out an existing
    revision."""
    idxrec = self[idx]
    base = idxrec[I_BASE]
    if base == _NO_RECORD:
        text = self._get_full_text(idx, idxrec)
    else:
        text = self._get_patched(idx, idxrec, recursion_limit)

    if sha.new(text).digest() != idxrec[I_SHA]:
        raise RevfileError("corrupt SHA-1 digest on record %d"

    """Index by sequence id returns the index field"""
    ## TODO: Can avoid seek if we just moved there...
    self._seek_index(idx)
    idxrec = self._read_next_index()

        raise IndexError("no index %d" % idx)

def _seek_index(self, idx):

        raise RevfileError("invalid index %r" % idx)
    self.idxfile.seek((idx + 1) * _RECORDSIZE)

    """Read back all index records.

    Do not seek the index file while this is underway!"""
    ## sys.stderr.write(" ** iter called ** \n")

        idxrec = self._read_next_index()

def _read_next_index(self):
    rec = self.idxfile.read(_RECORDSIZE)

    elif len(rec) != _RECORDSIZE:
        raise RevfileError("short read of %d bytes getting index %d from %r"
                           % (len(rec), idx, self.basename))

        f.write("#%-7d " % rec[1])

        f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))

def total_text_size(self):
    """Return the sum of sizes of all file texts.

    This is how much space they would occupy if they were stored without
    delta and gzip compression.

    As a side effect this completely validates the Revfile, checking that all
    texts can be reproduced with the correct SHA-1."""
    t = 0
    for idx in range(len(self)):
        t += len(self.get(idx))
    return t

except IndexError:
    sys.stderr.write("usage: revfile dump REVFILE\n"
                     " revfile add REVFILE < INPUT\n"
                     " revfile add-delta REVFILE BASE < INPUT\n"
                     " revfile add-series REVFILE BASE FILE...\n"
                     " revfile get REVFILE IDX\n"
                     " revfile find-sha REVFILE HEX\n"
                     " revfile total-text-size REVFILE\n"
                     " revfile last REVFILE\n")

    return Revfile(filename, 'w')

    return Revfile(filename, 'r')

    print rw().add(sys.stdin.read())

elif cmd == 'add-delta':
    print rw().add(sys.stdin.read(), int(argv[3]))

elif cmd == 'add-series':

    rev = r.add(file(fn).read(), rev)

elif cmd == 'dump':

elif cmd == 'get':

    except IndexError:
        sys.stderr.write("usage: revfile get FILE IDX\n")

    if idx < 0 or idx >= len(r):
        sys.stderr.write("invalid index %r\n" % idx)