3
# (C) 2005 Matt Mackall
5
# modified to squish into bzr by Martin Pool
7
# This program is free software; you can redistribute it and/or modify
8
# it under the terms of the GNU General Public License as published by
9
# the Free Software Foundation; either version 2 of the License, or
10
# (at your option) any later version.
12
# This program is distributed in the hope that it will be useful,
13
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
# GNU General Public License for more details.
17
# You should have received a copy of the GNU General Public License
18
# along with this program; if not, write to the Free Software
19
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22
"""Packed file revision storage.
24
A Revfile holds the text history of a particular source file, such
25
as Makefile. It can represent a tree of text versions for that
26
file, allowing for microbranches within a single repository.
28
This is stored on disk as two files: an index file, and a data file.
29
The index file is short and always read completely into memory; the
30
data file is much longer and only the relevant bits of it,
31
identified by the index file, need to be read.
33
Each text version is identified by the SHA-1 of the full text of
34
that version. It also has a sequence number within the file.
36
The index file has a short header and then a sequence of fixed-length
39
* byte[20] SHA-1 of text (as binary, not hex)
40
* uint32 sequence number this is based on, or -1 for full text
41
* uint32 flags: 1=zlib compressed
42
* uint32 offset in text file of start
43
* uint32 length of compressed delta in text file
48
The header is also 48 bytes for tidyness.
50
Both the index and the text are only ever appended to; a consequence
51
is that sequence numbers are stable references. But not every
52
repository in the world will assign the same sequence numbers,
53
therefore the SHA-1 is the only universally unique reference.
55
This is meant to scale to hold 100,000 revisions of a single file, by
56
which time the index file will be ~4.8MB and a bit big to read
59
Some of the reserved fields could be used to implement a (semi?)
60
balanced tree indexed by SHA1 so we can much more efficiently find the
61
index associated with a particular hash. For 100,000 revs we would be
62
able to find it in about 17 random reads, which is not too bad.
67
import sys, zlib, struct, mdiff, stat, os, sha
68
from binascii import hexlify
74
_HEADER = "bzr revfile v1\n"
75
_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))
77
class RevfileError(Exception):
81
def __init__(self, basename):
82
self.basename = basename
83
self.idxfile = open(basename + '.irev', 'r+b')
84
self.datafile = open(basename + '.drev', 'r+b')
85
if self.last_idx() == -1:
86
print 'init empty file'
87
self.idxfile.write(_HEADER)
90
h = self.idxfile.read(_RECORDSIZE)
92
raise RevfileError("bad header %r in index of %r"
96
"""Return last index already present, or -1 if none."""
97
l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
101
raise RevfileError("bad length %d on index of %r" % (l, self.basename))
102
return (l / _RECORDSIZE) - 1
105
def revision(self, rev):
106
base = self.index[rev][0]
107
start = self.index[base][1]
108
end = self.index[rev][1] + self.index[rev][2]
109
f = open(self.datafile())
112
data = f.read(end - start)
114
last = self.index[base][2]
115
text = zlib.decompress(data[:last])
117
for r in range(base + 1, rev + 1):
119
b = zlib.decompress(data[last:last + s])
120
text = mdiff.bpatch(text, b)
126
def add_full_text(self, t):
127
"""Add a full text to the file.
129
This is not compressed against any reference version.
131
Returns the index for that text."""
132
idx = self.last_idx() + 1
133
self.datafile.seek(0, 2) # to end
134
self.idxfile.seek(0, 2)
135
assert self.idxfile.tell() == _RECORDSIZE * idx
136
data_offset = self.datafile.tell()
138
assert isinstance(t, str) # not unicode or anything wierd
140
self.datafile.write(t)
141
self.datafile.flush()
143
entry = sha.new(t).digest()
144
entry += struct.pack(">llll12x", 0, 0, data_offset, len(t))
145
assert len(entry) == _RECORDSIZE
147
self.idxfile.write(entry)
154
return int(self.last_idx())
156
def __getitem__(self, idx):
157
self.idxfile.seek((idx + 1) * _RECORDSIZE)
158
rec = self.idxfile.read(_RECORDSIZE)
159
if len(rec) != _RECORDSIZE:
160
raise RevfileError("short read of %d bytes getting index %d from %r"
161
% (len(rec), idx, self.basename))
162
return struct.unpack(">20sllll12x", rec)
166
def addrevision(self, text, changeset):
171
data = zlib.compress(text)
174
prev = self.revision(t)
175
data = zlib.compress(mdiff.bdiff(prev, text))
176
base = self.index[t][0]
180
offset = self.index[t][1] + self.index[t][2]
182
self.index.append((base, offset, len(data), changeset))
183
entry = struct.pack(">llll", base, offset, len(data), changeset)
185
open(self.indexfile(), "a").write(entry)
186
open(self.datafile(), "a").write(data)
189
print '%-8s %-40s %-8s %-8s %-8s %-8s' \
190
% tuple('idx sha1 base flags offset len'.split())
191
print '-'*8, '-'*40, ('-'*8 + ' ')*4
192
for i in range(len(self)):
194
print "#%-7d %40s #%-7d %08x %8d %8d " \
195
% (i, hexlify(rec[0]), rec[1], rec[2], rec[3], rec[4])
200
r = Revfile("testrev")
202
sys.stderr.write("usage: revfile dump\n"
207
new_idx = r.add_full_text(sys.stdin.read())
208
print 'added idx %d' % new_idx
209
elif argv[1] == 'dump':
212
sys.stderr.write("unknown command %r\n" % argv[1])
216
if __name__ == '__main__':