~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/cache.py

Committer: Martin Pool
Date: 2005-05-09 06:09:42 UTC
Revision ID: mbp@sourcefrog.net-20050509060942-d9c9efd7feed0894

- more indicators at top of test output
- tidy up remotebranch stuff

files removed:
contrib/fortune

files renamed:
bzrlib/statcache.py => bzrlib/cache.py

files modified:
NEWS

TODO

bzrlib/add.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/commands.py

bzrlib/remotebranch.py

bzrlib/textui.py

Show diffs side-by-side

added added

removed removed

bzrlib/cache.py

from binascii import b2a_qp, a2b_qp

from trace import mutter

from errors import BzrError

"""File stat cache to speed up tree comparisons.

This module basically gives a quick way to find the SHA-1 and related

information of a file in the working directory, without actually

reading and hashing the whole file.

This is done by maintaining a cache indexed by a file fingerprint of

(path, size, mtime, ctime, ino, dev) pointing to the SHA-1. If the

fingerprint has not changed, we assume the file content has not changed

either and the SHA-1 is therefore the same.

If any of the fingerprint fields have changed then the file content

*may* have changed, or it may not have. We need to reread the file

contents to make sure, but this is not visible to the user or

higher-level code (except as a delay of course).

The mtime and ctime are stored with nanosecond fields, but not all

filesystems give this level of precision. There is therefore a

possible race: the file might be modified twice within a second

without changing the size or mtime, and a SHA-1 cached from the first

version would be wrong. We handle this by not recording a cached hash

for any files which were modified in the current second and that

therefore have the chance to change again before the second is up.

The only known hole in this design is if the system clock jumps

backwards between invocations of bzr. Please don't do that; use ntp

to gradually adjust your clock or don't use bzr over the step.

At the moment this is stored in a simple textfile; it might be nice

to use a tdb instead.

The cache is represented as a map from file_id to a tuple of (file_id,

sha1, path, size, mtime, ctime, ino, dev).

"""

FP_SIZE = 0

FP_MTIME = 1

FP_CTIME = 2

FP_INO = 3

FP_DEV = 4

SC_FILE_ID = 0

SC_SHA1 = 1

# file fingerprints are: (path, size, mtime, ctime, ino, dev).

# if this is the same for this file as in the previous revision, we

# assume the content is the same and the SHA-1 is the same.

# This is stored in a fingerprint file that also contains the file-id

# and the content SHA-1.

# Thus for any given file we can quickly get the SHA-1, either from

# the cache or, if the cache is out of date, by rereading the file.

# At the moment this is stored in a simple textfile; it might be nice

# to use a tdb instead.

# What we need:

# build a new cache from scratch

# load cache, incrementally update it

# TODO: Have a paranoid mode where we always compare the texts and

# always recalculate the digest, to trap modification without stat

# change and SHA collisions.

def fingerprint(path, abspath):

fs.st_ctime, fs.st_ino, fs.st_dev)

def _write_cache(branch, entry_iter, dangerfiles):

from atomicfile import AtomicFile

outf = AtomicFile(branch.controlfilename('stat-cache'), 'wb', 'utf-8')

try:

for entry in entry_iter:

if entry[0] in dangerfiles:

continue

outf.write(entry[0] + ' ' + entry[1] + ' ')

outf.write(b2a_qp(entry[2], True))

outf.write(' %d %d %d %d %d\n' % entry[3:])

def write_cache(branch, entry_iter):

outf = branch.controlfile('work-cache.tmp', 'wt')

for entry in entry_iter:

outf.write(entry[0] + ' ' + entry[1] + ' ')

outf.write(b2a_qp(entry[2], True))

outf.write(' %d %d %d %d %d\n' % entry[3:])

outf.close()

os.rename(branch.controlfilename('work-cache.tmp'),

branch.controlfilename('work-cache'))

outf.commit()

finally:

if not outf.closed:

100

outf.abort()

101

102

103

def load_cache(branch):

104

cache = {}

105

106

try:

107

cachefile = branch.controlfile('stat-cache', 'r')

cachefile = branch.controlfile('work-cache', 'rt')

108

except IOError:

109

return cache

110

126

yield ie.file_id, path

127

100

128

101

129

130

def update_cache(branch, inv=None, flush=False):

131

"""Update and return the cache for the branch.

132

133

The returned cache may contain entries that have not been written

134

to disk for files recently touched.

135

136

flush -- discard any previous cache and recalculate from scratch.

137

"""

138

102

def build_cache(branch):

103

inv = branch.read_working_inventory()

104

105

cache = {}

106

_update_cache_from_list(branch, cache, _files_from_inventory(inv))

139

107

108

109

110

def update_cache(branch, inv):

140

111

# TODO: It's supposed to be faster to stat the files in order by inum.

141

112

# We don't directly know the inum of the files of course but we do

142

113

# know where they were last sighted, so we can sort by that.

143

114

144

assert isinstance(flush, bool)

145

if flush:

146

cache = {}

147

else:

148

cache = load_cache(branch)

149

if inv == None:

150

inv = branch.read_working_inventory()

115

cache = load_cache(branch)

151

116

return _update_cache_from_list(branch, cache, _files_from_inventory(inv))

152

117

153

118

154

119

155

120

def _update_cache_from_list(branch, cache, to_update):

156

"""Update and return the cache for given files.

157

158

cache -- Previously cached values to be validated.

159

160

to_update -- Sequence of (file_id, path) pairs to check.

121

"""Update the cache to have info on the named files.

122

123

to_update is a sequence of (file_id, path) pairs.

161

124

"""

162

163

from sets import Set

164

165

125

hardcheck = dirty = 0

166

167

# files that have been recently touched and can't be

168

# committed to a persistent cache yet.

169

170

dangerfiles = Set()

171

now = int(time.time())

172

173

126

for file_id, path in to_update:

174

127

fap = branch.abspath(path)

175

128

fp = fingerprint(fap, path)

181

134

dirty += 1

182

135

continue

183

136

184

if (fp[FP_MTIME] >= now) or (fp[FP_CTIME] >= now):

185

dangerfiles.add(file_id)

186

187

137

if cacheentry and (cacheentry[3:] == fp):

188

138

continue # all stat fields unchanged

189

139

198

148

cache[file_id] = cacheentry

199

149

dirty += 1

200

150

201

mutter('statcache: read %d files, %d changed, %d dangerous, '

202

'%d in cache'

203

% (hardcheck, dirty, len(dangerfiles), len(cache)))

151

mutter('work cache: read %d files, %d changed' % (hardcheck, dirty))

204

152

205

153

if dirty:

206

mutter('updating on-disk statcache')

207

_write_cache(branch, cache.itervalues(), dangerfiles)

154

write_cache(branch, cache.itervalues())

208

155

209

156

return cache

Older »