~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/statcache.py

Committer: Martin Pool
Date: 2005-05-11 06:18:45 UTC
Revision ID: mbp@sourcefrog.net-20050511061845-f8cd99b4312e72f8

- Optional branch parameter to info command

files added:
bzrlib/statcache.py

bzrlib/tests.py

files removed:
HACKING

Makefile

bzr-man.py

bzrlib/changeset.py

bzrlib/commit.py

bzrlib/delta.py

bzrlib/hashcache.py

bzrlib/intset.py

bzrlib/lock.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/meta_store.py

bzrlib/missing.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/plugins/checkperms

bzrlib/progress.py

bzrlib/selftest

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/shellcomplete.py

bzrlib/upgrade.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/weave.py

bzrlib/weavefile.py

contrib/bash/bzr

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

doc/split-join-files.txt

notes/inventory-v2-sample.xml

notes/inventory-v2.rnc

notes/revfile.txt

notes/schemas.xml

patches

patches/annotate3.patch

patches/annotate4.patch

patches/cache-remote-revisions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/ndiff.patch

patches/pending-merge.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

testsweet.py

tools

tools/convertfile.py

tools/convertinv.py

tools/history2revfiles.py

tools/history2weaves.py

tools/http_client.py

tools/testweave.py

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files renamed:
contrib/bash/bzr.simple => contrib/bash/bzr

bzrlib/util/elementtree/ => elementtree/

bzrlib/util/urlgrabber/ => urlgrabber/

files modified:
.bzrignore

NEWS

README

TODO

build-api

bzrlib/__init__.py

bzrlib/add.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/help.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/log.py

bzrlib/mdiff.py

bzrlib/newinventory.py

bzrlib/osutils.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/status.py

bzrlib/store.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/workingtree.py

bzrlib/xml.py

contrib/add-bzr-to-baz

doc/formats.txt

doc/index.txt

doc/tagging.txt

setup.py

testbzr

urlgrabber/keepalive.py

Show diffs side-by-side

added added

removed removed

bzrlib/statcache.py

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import stat, os, sha, time

from binascii import b2a_qp, a2b_qp

from trace import mutter

from errors import BzrError

"""File stat cache to speed up tree comparisons.

This module basically gives a quick way to find the SHA-1 and related

information of a file in the working directory, without actually

reading and hashing the whole file.

This is done by maintaining a cache indexed by a file fingerprint of

(path, size, mtime, ctime, ino, dev) pointing to the SHA-1. If the

fingerprint has not changed, we assume the file content has not changed

either and the SHA-1 is therefore the same.

If any of the fingerprint fields have changed then the file content

*may* have changed, or it may not have. We need to reread the file

contents to make sure, but this is not visible to the user or

higher-level code (except as a delay of course).

The mtime and ctime are stored with nanosecond fields, but not all

filesystems give this level of precision. There is therefore a

possible race: the file might be modified twice within a second

without changing the size or mtime, and a SHA-1 cached from the first

version would be wrong. We handle this by not recording a cached hash

for any files which were modified in the current second and that

therefore have the chance to change again before the second is up.

The only known hole in this design is if the system clock jumps

backwards crossing invocations of bzr. Please don't do that; use ntp

to gradually adjust your clock or don't use bzr over the step.

At the moment this is stored in a simple textfile; it might be nice

to use a tdb instead.

The cache is represented as a map from file_id to a tuple of (file_id,

sha1, path, size, mtime, ctime, ino, dev).

"""

# Positions of the fields inside the tuple returned by fingerprint():
# (size, mtime, ctime, ino, dev).
(FP_SIZE, FP_MTIME, FP_CTIME, FP_INO, FP_DEV) = range(5)

# Positions of the leading fields of a cache entry tuple, which is laid
# out as (file_id, sha1, path) followed by the fingerprint fields.
(SC_FILE_ID, SC_SHA1) = (0, 1)

def fingerprint(abspath):
    """Return the stat fingerprint of the file at abspath.

    The result is a tuple (size, mtime, ctime, ino, dev), or None if the
    path cannot be lstat'ed (for example because it is missing) or if it
    is a directory.

    mtime and ctime are truncated to whole seconds.  The on-disk cache
    writes every fingerprint field with '%d' and reads it back as an
    integer, so a fractional timestamp would never compare equal to the
    stored value and the file would be re-hashed on every run.
    """
    try:
        fs = os.lstat(abspath)
    except OSError:
        # might be missing, etc
        return None

    if stat.S_ISDIR(fs.st_mode):
        return None

    return (fs.st_size, int(fs.st_mtime),
            int(fs.st_ctime), fs.st_ino, fs.st_dev)

def _write_cache(basedir, entry_iter, dangerfiles):
    """Write cache entries to the '.bzr/stat-cache' file under basedir.

    entry_iter -- Iterable of cache tuples laid out as (file_id, sha1,
        path, size, mtime, ctime, ino, dev).

    dangerfiles -- Container of file_ids; entries whose file_id is in it
        are skipped and therefore not written.

    Each entry becomes one space-separated line.  The path is
    quoted-printable encoded (with spaces quoted) so that it cannot
    introduce extra field separators.  The output goes through an
    AtomicFile, which is committed once all entries are written and
    aborted in the finally clause if it is still open.
    """
    from atomicfile import AtomicFile

    cachefn = os.path.join(basedir, '.bzr', 'stat-cache')
    outf = AtomicFile(cachefn, 'wb', 'utf-8')
    try:
        for entry in entry_iter:
            if entry[0] in dangerfiles:
                continue
            # b2a_qp wraps long output with soft line breaks ('=' followed
            # by a newline).  The cache is read back one line per entry,
            # so those breaks must be removed; a literal '=' in the path
            # is always encoded as '=3D', so this cannot eat real data.
            quoted_path = b2a_qp(entry[2], True).replace('=\n', '')
            outf.write(entry[0] + ' ' + entry[1] + ' ')
            outf.write(quoted_path)
            outf.write(' %d %d %d %d %d\n' % entry[3:])

        outf.commit()
    finally:
        if not outf.closed:
            outf.abort()

102

103

104

def load_cache(basedir):
    """Read the '.bzr/stat-cache' file under basedir into a dict.

    Returns a map from file_id to a tuple of (file_id, sha1, path)
    followed by the integer fields found on the rest of the line.  The
    path is decoded from quoted-printable.  If the cache file cannot be
    opened an empty dict is returned.

    Raises BzrError if the same file_id occurs on more than one line.
    """
    import codecs

    cache = {}

    try:
        cachefn = os.path.join(basedir, '.bzr', 'stat-cache')
        cachefile = codecs.open(cachefn, 'r', 'utf-8')
    except IOError:
        return cache

    # Make sure the file is closed even when a duplicate id aborts the load.
    try:
        for line in cachefile:
            f = line.split(' ')
            file_id = f[0]
            if file_id in cache:
                raise BzrError("duplicated file_id in cache: {%s}" % file_id)
            # int() tolerates the trailing newline on the last field and
            # promotes to a long integer by itself when the value is large.
            cache[file_id] = ((f[0], f[1], a2b_qp(f[2]))
                              + tuple([int(x) for x in f[3:]]))
    finally:
        cachefile.close()

    return cache

122

123

124

125

126

def _files_from_inventory(inv):
    """Yield (file_id, path) for each inventory entry whose kind is 'file'.

    Entries are taken from inv.iter_entries(), which yields
    (path, entry) pairs; entries of any other kind are skipped.
    """
    for entry_path, entry in inv.iter_entries():
        if entry.kind == 'file':
            yield (entry.file_id, entry_path)

131

132

133

134

def update_cache(basedir, inv, flush=False):
    """Bring the stat cache for the branch up to date and return it.

    The cache that is returned can hold entries for recently touched
    files that have not been written to disk.

    flush -- when true, ignore any previously stored cache and rebuild
        everything from scratch.
    """
    # TODO: It's supposed to be faster to stat the files in order by inum.
    # We don't directly know the inum of the files of course but we do
    # know where they were last sighted, so we can sort by that.

    assert isinstance(flush, bool)

    # Start from an empty cache and only load the stored one when the
    # caller did not ask for a flush.
    cache = {}
    if not flush:
        cache = load_cache(basedir)

    candidates = _files_from_inventory(inv)
    return _update_cache_from_list(basedir, cache, candidates)

154

155

156

157

def _update_cache_from_list(basedir, cache, to_update):
    """Update and return the cache for given files.

    cache -- Previously cached values to be validated.  The dict is
        modified in place and is also the return value.

    to_update -- Sequence of (file_id, path) pairs to check.

    Each file is statted.  If it is missing (or a directory) any cached
    entry for it is dropped.  If its fingerprint matches the cached one
    the entry is kept as is.  Otherwise the file is read and hashed and
    the entry is replaced.  If anything changed, the cache is written
    back to disk, leaving out files touched in the current second.
    """
    from sets import Set

    stat_cnt = hardcheck = change_cnt = 0

    # files that have been recently touched and can't be
    # committed to a persistent cache yet.
    dangerfiles = Set()
    now = int(time.time())

    for file_id, path in to_update:
        abspath = os.path.join(basedir, path)
        fp = fingerprint(abspath)
        stat_cnt += 1

        cacheentry = cache.get(file_id)

        if fp is None:  # not here
            if cacheentry:
                del cache[file_id]
                change_cnt += 1
            continue

        if (fp[FP_MTIME] >= now) or (fp[FP_CTIME] >= now):
            dangerfiles.add(file_id)

        if cacheentry and (cacheentry[3:] == fp):
            continue  # all stat fields unchanged

        hardcheck += 1

        # Close the file explicitly rather than relying on garbage
        # collection to release the handle.
        infile = file(abspath, 'rb')
        try:
            dig = sha.new(infile.read()).hexdigest()
        finally:
            infile.close()

        # Store the entry even when the digest is the same as before, so
        # that the new fingerprint is remembered.  Otherwise a file that
        # was merely touched would be re-read on every later run.
        cache[file_id] = (file_id, dig, path) + fp
        change_cnt += 1

    mutter('statcache: statted %d files, read %d files, %d changed, %d dangerous, '
           '%d in cache'
           % (stat_cnt, hardcheck, change_cnt, len(dangerfiles), len(cache)))

    if change_cnt:
        mutter('updating on-disk statcache')
        _write_cache(basedir, cache.itervalues(), dangerfiles)

    return cache

Older »