is that sequence numbers are stable references. But not every
repository in the world will assign the same sequence numbers;
the SHA-1 is therefore the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by
which time the index file will be ~4.8MB and a bit big to read
sequentially.

The iter method here will generally read through the whole index file
in one go. With readahead in the kernel and python/libc (typically
128kB) this means that there should be no seeks and often only one
read() call to get everything into memory.

Some of the reserved fields could be used to implement a (semi?)
balanced tree indexed by SHA1 so we can much more efficiently find the
index associated with a particular hash. For 100,000 revs we would be
able to find it in about 17 random reads, which is not too bad.
"""
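
# Back-of-envelope check of the docstring's figures (illustrative; the
# 48-byte record size is inferred from ~4.8MB / 100,000 revisions):
#
#   index size:  100,000 records * 48 bytes/record = 4,800,000 bytes ~= 4.8MB
#   tree depth:  ceil(log2(100,000)) = ceil(16.6) = 17 random reads per lookup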

# TODO: Something like pread() would make this slightly simpler and
# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the
# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that it
# probably means rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file
# completely butt up and do not overlap. Strictly it's not a problem
# if there are gaps and that can happen if we're interrupted while
# writing to the datafile. Overlapping would be very bad though.
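
# A sketch of what that check might look like (illustrative only; the
# I_OFFSET and I_LEN index fields are assumed by analogy with the I_SHA
# and I_BASE fields used below):
#
#     def _check_contiguous(self):
#         end = 0
#         for idxrec in self:
#             if idxrec[I_OFFSET] < end:
#                 raise RevfileError("overlapping regions in data file")
#             # gaps (idxrec[I_OFFSET] > end) are tolerated, as noted above
#             end = idxrec[I_OFFSET] + idxrec[I_LEN]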

import sys, zlib, struct, mdiff, stat, os, sha

# maximum number of patches in a row before recording a whole text.
CHAIN_LIMIT = 25        # assumed value; the exact limit is a tuning choice


class RevfileError(Exception):
    pass


class LimitHitException(Exception):
    pass


class Revfile(object):
    def __init__(self, basename, mode):
        # TODO: Lock file while open

        # TODO: advise of random access

        self.basename = basename

        if mode not in ['r', 'w']:
            raise RevfileError("invalid open mode %r" % mode)
        self.mode = mode

        idxname = basename + '.irev'
        dataname = basename + '.drev'

        idx_exists = os.path.exists(idxname)
        data_exists = os.path.exists(dataname)

    def _check_index(self, idx):
        if idx < 0 or idx >= len(self):
            raise RevfileError("invalid index %r" % idx)

    def _check_write(self):
        if self.mode != 'w':
            raise RevfileError("%r is open readonly" % self.basename)

    def find_sha(self, s):
        assert isinstance(s, str)

        # linear scan over the whole index; see the TODO near the top of
        # this file about a faster lookup of SHAs
        for idx, idxrec in enumerate(self):
            if idxrec[I_SHA] == s:
                return idx
        return _NO_RECORD

    def _add_compressed(self, text_sha, data, base, compress):
        # well, maybe compress
        flags = 0
        data_len = len(data)
        if compress:
            # don't do compression if it's too small; it's unlikely to win
            # enough to be worthwhile
            compr_data = zlib.compress(data)
            compr_len = len(compr_data)
            if compr_len < data_len:
                data = compr_data
                flags = FL_GZIP    # assumed flag constant marking gzipped records
                ##print '- compressed %d -> %d, %.1f%%' \
                ##    % (data_len, compr_len, float(compr_len)/float(data_len) * 100.0)
        return self._add_raw(text_sha, data, base, flags)
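
    # Keeping the compressed form only when it is strictly smaller matters
    # for tiny texts: zlib's header and checksum mean that, for example,
    # zlib.compress('hi') is 10 bytes for a 2-byte input.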

    def _add_raw(self, text_sha, data, base, flags):
        """Add pre-processed data, can be either full text or delta.

        The data is written as-is; any compression has already been
        done by the caller."""
        idx = len(self)             # the index the new record will get

        self.datafile.seek(0, 2)        # to end
        self.idxfile.seek(0, 2)
        assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
        data_offset = self.datafile.tell()

        assert isinstance(data, str) # not unicode or anything weird

        self.datafile.write(data)
        self.datafile.flush()
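
        # Note the ordering: the data file is written and flushed before the
        # index record is appended, so an interruption can leave a gap in
        # the data file but never an index entry pointing at missing data
        # (see the TODO about regions butting up, near the top of this file).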

    def _add_full_text(self, text, text_sha, compress):
        """Add a full text to the file.

        This is not compressed against any reference version.

        Returns the index for that text."""
        return self._add_compressed(text_sha, text, _NO_RECORD, compress)

    def _add_delta(self, text, text_sha, base, compress):
        """Add a text stored relative to a previous text."""
        self._check_index(base)

        try:
            base_text = self.get(base, recursion_limit=CHAIN_LIMIT)
        except LimitHitException:
            return self._add_full_text(text, text_sha, compress)

        data = mdiff.bdiff(base_text, text)

        # If the delta is larger than the text, we might as well just
        # store the text. (OK, the delta might be more compressible,
        # but the overhead of applying it probably still makes it
        # bad, and I don't want to compress both of them to find out.)
        if len(data) >= len(text):
            return self._add_full_text(text, text_sha, compress)

        return self._add_compressed(text_sha, data, base, compress)
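
    # Every delta adds one link to the chain that get() must patch through;
    # CHAIN_LIMIT (above) caps that, so retrieval cost stays bounded too.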

    def add(self, text, base=_NO_RECORD, compress=True):
        """Add a new text to the revfile.

        If the text is already present then its existing id is
        returned and the file is not changed.

        If compress is true then gzip compression will be used if it
        reduces the size.

        If a base index is specified, that text *may* be used for
        delta compression of the new text. Delta compression will
        only be used if it would be a size win and if the existing
        base is not already part of too long a delta chain.
        """
        text_sha = sha.new(text).digest()

        idx = self.find_sha(text_sha)
        if idx != _NO_RECORD:
            # TODO: Optional paranoid mode where we read out that record
            # and make sure it's the same, in case someone ever breaks
            # SHA-1.
            return idx          # already present

        if base == _NO_RECORD:
            return self._add_full_text(text, text_sha, compress)
        return self._add_delta(text, text_sha, base, compress)
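
    # Hypothetical usage sketch (file name and texts are illustrative only):
    #
    #     r = Revfile('foo', 'w')
    #     i0 = r.add('hello world\n')              # stored as a full text
    #     i1 = r.add('hello there\n', base=i0)     # may be stored as a delta
    #     assert r.get(i1) == 'hello there\n'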

    def get(self, idx, recursion_limit=None):
        """Retrieve text of a previous revision.

        If recursion_limit is an integer then walk back at most that
        many revisions and then raise LimitHitException, indicating
        that we ought to record a new file text instead of another
        delta. Don't use this when trying to get out an existing
        revision."""
        idxrec = self[idx]
        base = idxrec[I_BASE]
        if base == _NO_RECORD:
            text = self._get_full_text(idx, idxrec)
        else:
            text = self._get_patched(idx, idxrec, recursion_limit)

        if sha.new(text).digest() != idxrec[I_SHA]:
            raise RevfileError("corrupt SHA-1 digest on record %d"
                               % idx)

        return text

    def __getitem__(self, idx):
        """Index by sequence id; returns the index record."""
        ## TODO: Can avoid seek if we just moved there...
        self._seek_index(idx)
        return self._read_next_index()

    def _seek_index(self, idx):
        if idx < 0:
            raise RevfileError("invalid index %r" % idx)
        self.idxfile.seek((idx + 1) * _RECORDSIZE)
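
    # Index file layout implied by the (idx + 1) above (record size inferred
    # from the ~4.8MB / 100,000 revisions figure in the docstring, i.e. 48
    # bytes; the first slot is presumably a file header):
    #
    #     bytes   0..47    header
    #     bytes  48..95    index record 0
    #     bytes  96..143   index record 1, and so on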

    def __iter__(self):
        """Read back all index records.

        Do not seek the index file while this is underway!"""
        self._seek_index(0)
        while True:
            try:
                yield self._read_next_index()
            except IndexError:
                return

    def _read_next_index(self):
        rec = self.idxfile.read(_RECORDSIZE)
        if not rec:
            raise IndexError("end of index file")
        elif len(rec) != _RECORDSIZE:
            raise RevfileError("short read of %d bytes getting index from %r"
                               % (len(rec), self.basename))