~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: mbp at sourcefrog
Date: 2005-04-09 02:49:04 UTC
Revision ID: mbp@sourcefrog.net-20050409024904-a73e87ce87a0077d9986b40e

- experimental compressed Revfile support
not integrated yet

files added:
bzrlib/mdiff.py

bzrlib/revfile.py

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

#! /usr/bin/env python

# modified to squish into bzr by Martin Pool

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Packed file revision storage.

A Revfile holds the text history of a particular source file, such

as Makefile. It can represent a tree of text versions for that

file, allowing for microbranches within a single repository.

This is stored on disk as two files: an index file, and a data file.

The index file is short and always read completely into memory; the

data file is much longer and only the relevant bits of it,

identified by the index file, need to be read.

Each text version is identified by the SHA-1 of the full text of

that version. It also has a sequence number within the file.

The index file has a short header and then a sequence of fixed-length

records:

* byte[20] SHA-1 of text (as binary, not hex)

* uint32 sequence number this is based on, or -1 for full text

* uint32 flags: 1=zlib compressed

* uint32 offset in text file of start

* uint32 length of compressed delta in text file

* uint32[3] reserved

total 48 bytes.

The header is also 48 bytes for tidiness.

Both the index and the text are only ever appended to; a consequence

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

"""

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify

factor = 10

_RECORDSIZE = 48

_HEADER = "bzr revfile v1\n"

_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))

class RevfileError(Exception):
    """Raised when a revfile index has a bad header, a bad length or a short record."""

class Revfile:
    """Append-only store of text versions, kept as an index file plus a data file.

    The index is ``basename + '.irev'`` and the data is
    ``basename + '.drev'``.  The index starts with a header of
    _RECORDSIZE bytes, followed by one _RECORDSIZE-byte record per stored
    text.  Texts are numbered from 0; the header is never counted as a
    text, so text ``i`` lives at byte offset ``(i + 1) * _RECORDSIZE``.
    """

    # Value stored in the 'base' field of a record that holds a full text
    # rather than a delta against another record.
    _NO_BASE = -1

    def __init__(self, basename):
        """Open the index and data files for ``basename``.

        Both files must already exist, because they are opened in 'r+b'
        mode.  A zero-length index gets the header written to it;
        otherwise the header already present is verified.

        Raises RevfileError if the header does not match or if the index
        length is not a whole number of records.
        """
        self.basename = basename
        self.idxfile = open(basename + '.irev', 'r+b')
        self.datafile = open(basename + '.drev', 'r+b')
        if self.last_idx() == -1:
            print('init empty file')
            self.idxfile.write(_HEADER)
            self.idxfile.flush()
        else:
            h = self.idxfile.read(_RECORDSIZE)
            if h != _HEADER:
                raise RevfileError("bad header %r in index of %r"
                                   % (h, self.basename))

    def last_idx(self):
        """Return the number of the last record in the index file.

        Records are counted from the start of the file, so the header is
        record 0: an index holding only the header gives 0, and one
        holding N texts gives N.  Returns -1 when the index file is
        completely empty (no header written yet).

        Raises RevfileError if the file length is not a multiple of
        _RECORDSIZE.
        """
        l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
        if l == 0:
            return -1
        if l % _RECORDSIZE:
            raise RevfileError("bad length %d on index of %r" % (l, self.basename))
        # Floor division keeps the result an integer.
        return (l // _RECORDSIZE) - 1

    def revision(self, rev):
        # NOTE(review): not usable yet.  This method reads self.index and
        # calls self.datafile(), but nothing in this class sets self.index
        # and self.datafile is a file object, not a callable.  The code is
        # left untouched until it is integrated with the record format
        # used by add_full_text() and __getitem__().
        #
        # As written, it reads the span from the start of the base entry
        # to the end of entry `rev`, decompresses the first chunk as the
        # full text and then applies each following decompressed chunk
        # with mdiff.bpatch().
        base = self.index[rev][0]
        start = self.index[base][1]
        end = self.index[rev][1] + self.index[rev][2]
        f = open(self.datafile())

        f.seek(start)
        data = f.read(end - start)

        last = self.index[base][2]
        text = zlib.decompress(data[:last])

        for r in range(base + 1, rev + 1):
            s = self.index[r][2]
            b = zlib.decompress(data[last:last + s])
            text = mdiff.bpatch(text, b)
            last = last + s

        return text

    def add_full_text(self, t):
        """Add a full text to the file.

        This is not compressed against any reference version.

        Returns the index for that text, counted from 0 in the same way
        as __getitem__() and dump(), so ``self[returned_index]`` is the
        record just written.
        """
        # last_idx() counts the header as record 0, so its value equals
        # the number of texts already stored, which is the 0-based index
        # of the text about to be appended.
        idx = self.last_idx()
        self.datafile.seek(0, 2)        # to end
        self.idxfile.seek(0, 2)
        assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
        data_offset = self.datafile.tell()

        assert isinstance(t, str)       # not unicode or anything weird

        self.datafile.write(t)
        self.datafile.flush()

        # Record layout: sha1, base, flags, offset, length, 12 reserved bytes.
        entry = sha.new(t).digest()
        entry += struct.pack(">llll12x", self._NO_BASE, 0, data_offset, len(t))
        assert len(entry) == _RECORDSIZE

        self.idxfile.write(entry)
        self.idxfile.flush()

        return idx

    def __len__(self):
        """Return the number of texts recorded in the index (header excluded)."""
        return int(self.last_idx())

    def __getitem__(self, idx):
        """Return index record ``idx`` as (sha1, base, flags, offset, length).

        The sha1 is the 20-byte binary digest.  Raises RevfileError if a
        complete record cannot be read at that position.
        """
        # Skip the header, which occupies the first record slot.
        self.idxfile.seek((idx + 1) * _RECORDSIZE)
        rec = self.idxfile.read(_RECORDSIZE)
        if len(rec) != _RECORDSIZE:
            raise RevfileError("short read of %d bytes getting index %d from %r"
                               % (len(rec), idx, self.basename))
        return struct.unpack(">20sllll12x", rec)

    def addrevision(self, text, changeset):
        # NOTE(review): not usable yet.  This method relies on self.tip(),
        # self.index, self.indexfile() and a callable self.datafile(),
        # none of which this class provides.  It also packs a 16-byte
        # ">llll" entry rather than the _RECORDSIZE-byte record that
        # add_full_text() writes.  The code is left untouched until it is
        # integrated.
        t = self.tip()
        n = t + 1

        if not n % factor:
            data = zlib.compress(text)
            base = n
        else:
            prev = self.revision(t)
            data = zlib.compress(mdiff.bdiff(prev, text))
            base = self.index[t][0]

        offset = 0
        if t >= 0:
            offset = self.index[t][1] + self.index[t][2]

        self.index.append((base, offset, len(data), changeset))
        entry = struct.pack(">llll", base, offset, len(data), changeset)

        open(self.indexfile(), "a").write(entry)
        open(self.datafile(), "a").write(data)

    def dump(self):
        """Print a table of every index record to standard output."""
        print('%-8s %-40s %-8s %-8s %-8s %-8s'
              % tuple('idx sha1 base flags offset len'.split()))
        print(' '.join(['-'*8, '-'*40, ('-'*8 + ' ')*4]))
        for i in range(len(self)):
            rec = self[i]
            print("#%-7d %40s #%-7d %08x %8d %8d "
                  % (i, hexlify(rec[0]), rec[1], rec[2], rec[3], rec[4]))

196

197

198

199

def main(argv):
    """Run the revfile test command named in ``argv[1]``.

    'add' stores the whole of standard input as a new full text in the
    "testrev" revfile and prints the index assigned to it; 'dump' prints
    the index of that revfile.

    The command line is checked before the revfile is opened, so a usage
    error or an unknown command is reported (exit status 1) without
    touching the "testrev" files.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: revfile dump\n"
                         " revfile add\n")
        sys.exit(1)

    cmd = argv[1]
    if cmd not in ('add', 'dump'):
        sys.stderr.write("unknown command %r\n" % cmd)
        sys.exit(1)

    r = Revfile("testrev")
    if cmd == 'add':
        new_idx = r.add_full_text(sys.stdin.read())
        print('added idx %d' % new_idx)
    else:
        r.dump()

214

215

216

if __name__ == '__main__':
    # Executed as a script: pass the full command line on to main().
    import sys
    main(sys.argv)

Older »