~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/statcache.py

Committer: Martin Pool
Date: 2005-07-07 10:13:43 UTC
mfrom: (0.1.95)
Revision ID: mbp@sourcefrog.net-20050707101342-aa2ef950004cb467

todo

files added:
bzrlib/statcache.py

plugins/changeset

plugins/changeset/__init__.py

plugins/changeset/apply_changeset.py

plugins/changeset/common.py

plugins/changeset/gen_changeset.py

plugins/changeset/read_changeset.py

plugins/checkperms

files removed:
HACKING

Makefile

bzrlib/builtins.py

bzrlib/delta.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/intset.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/plugins/__init__.py

bzrlib/selftest/TestUtil.py

bzrlib/selftest/test_merge_core.py

bzrlib/selftest/test_parent.py

bzrlib/selftest/test_smart_add.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testdiff.py

bzrlib/selftest/testfetch.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/teststore.py

bzrlib/shellcomplete.py

bzrlib/ui.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/weave.py

bzrlib/weavefile.py

contrib/emacs

contrib/emacs/bzr-mode.el

doc/split-join-files.txt

notes/inventory-v2-sample.xml

notes/inventory-v2.rnc

notes/revfile.txt

notes/schemas.xml

patches/pending-merge.patch

tools

tools/convertfile.py

tools/convertinv.py

tools/history2revfiles.py

tools/history2weaves.py

tools/http_client.py

tools/testweave.py

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files renamed:
bzrlib/util/effbot/ => effbot/

bzrlib/util/elementtree/ => elementtree/

bzrlib/plugins/ => plugins/

bzrlib/util/urlgrabber/ => urlgrabber/

files modified:
.bzrignore

NEWS

README

TODO

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/changeset.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/inventory.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/mdiff.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/newinventory.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/status.py

bzrlib/store.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/upgrade.py

bzrlib/workingtree.py

bzrlib/xml.py

contrib/pwk

doc/index.txt

doc/todo-from-arch.txt

setup.py

testsweet.py

Show diffs side-by-side

added added

removed removed

bzrlib/statcache.py

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import stat, os, sha, time

from trace import mutter

from errors import BzrError, BzrCheckError

"""File stat cache to speed up tree comparisons.

This module basically gives a quick way to find the SHA-1 and related

information of a file in the working directory, without actually

reading and hashing the whole file. The information is validated by

checking the size, mtime, ctime, etc of the file as returned by the

stat() system call.

This has no relation to the deprecated standard Python module called

statcache (vs bzrlib.statcache).

Implementation

==============

Users of this module should not need to know about how this is

implemented, and in particular should not depend on the particular

data which is stored or its format.

The cache maintains a mapping from filename to the SHA-1 of the

content of the file.

The cache also stores a fingerprint of (size, mtime, ctime, ino, dev)

which is used to validate that the entry is up-to-date.

This is done by maintaining a cache indexed by a file fingerprint of

(path, size, mtime, ctime, ino, dev) pointing to the SHA-1. If the

fingerprint has changed, we assume the file content has not changed

either and the SHA-1 is therefore the same.

If any of the fingerprint fields have changed then the file content

*may* have changed, or it may not have. We need to reread the file

contents to make sure, but this is not visible to the user or

higher-level code (except as a delay of course).

The mtime and ctime are stored with nanosecond fields, but not all

filesystems give this level of precision. There is therefore a

possible race: the file might be modified twice within a second

without changing the size or mtime, and a SHA-1 cached from the first

version would be wrong. We handle this by not recording a cached hash

for any files which were modified in the current second and that

therefore have the chance to change again before the second is up.

The only known hole in this design is if the system clock jumps

backwards crossing invocations of bzr. Please don't do that; use ntp

to gradually adjust your clock or don't use bzr over the step.

At the moment this is stored in a simple textfile; it might be nice

to use a tdb instead to allow faster lookup by file-id.

The cache is represented as a map from file_id to a tuple of (file_id,

sha1, path, size, mtime, ctime, ino, dev).

The SHA-1 is stored in memory as a hexdigest.

This version of the file on disk has one line per record, and fields

separated by \0 records.

"""

# order of fields returned by fingerprint()

FP_SIZE = 0

FP_MTIME = 1

FP_CTIME = 2

FP_INO = 3

FP_DEV = 4

# order of fields in the statcache file and in the in-memory map

SC_FILE_ID = 0

SC_SHA1 = 1

SC_PATH = 2

SC_SIZE = 3

SC_MTIME = 4

SC_CTIME = 5

SC_INO = 6

100

SC_DEV = 7

101

102

103

104

CACHE_HEADER = "### bzr statcache v4"

105

106

107

def fingerprint(abspath):

108

try:

109

fs = os.lstat(abspath)

110

except OSError:

111

# might be missing, etc

112

return None

113

114

if stat.S_ISDIR(fs.st_mode):

115

return None

116

117

return (fs.st_size, fs.st_mtime,

118

fs.st_ctime, fs.st_ino, fs.st_dev)

119

120

121

122

def _write_cache(basedir, entries):

123

from atomicfile import AtomicFile

124

125

cachefn = os.path.join(basedir, '.bzr', 'stat-cache')

126

outf = AtomicFile(cachefn, 'wb')

127

try:

128

outf.write(CACHE_HEADER + '\n')

129

130

for entry in entries:

131

if len(entry) != 8:

132

raise ValueError("invalid statcache entry tuple %r" % entry)

133

outf.write(entry[0].encode('utf-8')) # file id

134

outf.write('\0')

135

outf.write(entry[1]) # hex sha1

136

outf.write('\0')

137

outf.write(entry[2].encode('utf-8')) # name

138

for nf in entry[3:]:

139

outf.write('\0%d' % nf)

140

outf.write('\n')

141

142

outf.commit()

143

finally:

144

if not outf.closed:

145

outf.abort()

146

147

148

def _try_write_cache(basedir, entries):

149

try:

150

return _write_cache(basedir, entries)

151

except IOError, e:

152

mutter("cannot update statcache in %s: %s" % (basedir, e))

153

except OSError, e:

154

mutter("cannot update statcache in %s: %s" % (basedir, e))

155

156

157

158

def load_cache(basedir):

159

import re

160

cache = {}

161

seen_paths = {}

162

from bzrlib.trace import warning

163

164

assert isinstance(basedir, basestring)

165

166

sha_re = re.compile(r'[a-f0-9]{40}')

167

168

try:

169

cachefn = os.path.join(basedir, '.bzr', 'stat-cache')

170

cachefile = open(cachefn, 'rb')

171

except IOError:

172

return cache

173

174

line1 = cachefile.readline().rstrip('\r\n')

175

if line1 != CACHE_HEADER:

176

mutter('cache header marker not found at top of %s; discarding cache'

177

% cachefn)

178

return cache

179

180

for l in cachefile:

181

f = l.split('\0')

182

183

file_id = f[0].decode('utf-8')

184

if file_id in cache:

185

warning("duplicated file_id in cache: {%s}" % file_id)

186

187

text_sha = f[1]

188

if len(text_sha) != 40 or not sha_re.match(text_sha):

189

raise BzrCheckError("invalid file SHA-1 in cache: %r" % text_sha)

190

191

path = f[2].decode('utf-8')

192

if path in seen_paths:

193

warning("duplicated path in cache: %r" % path)

194

seen_paths[path] = True

195

196

entry = (file_id, text_sha, path) + tuple([long(x) for x in f[3:]])

197

if len(entry) != 8:

198

raise ValueError("invalid statcache entry tuple %r" % entry)

199

200

cache[file_id] = entry

201

return cache

202

203

204

205

def _files_from_inventory(inv):

206

for path, ie in inv.iter_entries():

207

if ie.kind != 'file':

208

continue

209

yield ie.file_id, path

210

211

212

213

def update_cache(basedir, inv, flush=False):

214

"""Update and return the cache for the branch.

215

216

The returned cache may contain entries that have not been written

217

to disk for files recently touched.

218

219

flush -- discard any previous cache and recalculate from scratch.

220

"""

221

222

# load the existing cache; use information there to find a list of

223

# files ordered by inode, which is alleged to be the fastest order

224

# to stat the files.

225

226

to_update = _files_from_inventory(inv)

227

228

assert isinstance(flush, bool)

229

if flush:

230

cache = {}

231

else:

232

cache = load_cache(basedir)

233

234

by_inode = []

235

without_inode = []

236

for file_id, path in to_update:

237

if file_id in cache:

238

by_inode.append((cache[file_id][SC_INO], file_id, path))

239

else:

240

without_inode.append((file_id, path))

241

by_inode.sort()

242

243

to_update = [a[1:] for a in by_inode] + without_inode

244

245

stat_cnt = missing_cnt = new_cnt = hardcheck = change_cnt = 0

246

247

# dangerfiles have been recently touched and can't be committed to

248

# a persistent cache yet, but they are returned to the caller.

249

dangerfiles = []

250

251

now = int(time.time())

252

253

## mutter('update statcache under %r' % basedir)

254

for file_id, path in to_update:

255

abspath = os.path.join(basedir, path)

256

fp = fingerprint(abspath)

257

stat_cnt += 1

258

259

cacheentry = cache.get(file_id)

260

261

if fp == None: # not here

262

if cacheentry:

263

del cache[file_id]

264

change_cnt += 1

265

missing_cnt += 1

266

continue

267

elif not cacheentry:

268

new_cnt += 1

269

270

if (fp[FP_MTIME] >= now) or (fp[FP_CTIME] >= now):

271

dangerfiles.append(file_id)

272

273

if cacheentry and (cacheentry[3:] == fp):

274

continue # all stat fields unchanged

275

276

hardcheck += 1

277

278

dig = sha.new(file(abspath, 'rb').read()).hexdigest()

279

280

# We update the cache even if the digest has not changed from

281

# last time we looked, so that the fingerprint fields will

282

# match in future.

283

cacheentry = (file_id, dig, path) + fp

284

cache[file_id] = cacheentry

285

change_cnt += 1

286

287

mutter('statcache: statted %d files, read %d files, %d changed, %d dangerous, '

288

'%d deleted, %d new, '

289

'%d in cache'

290

% (stat_cnt, hardcheck, change_cnt, len(dangerfiles),

291

missing_cnt, new_cnt, len(cache)))

292

293

if change_cnt:

294

mutter('updating on-disk statcache')

295

296

if dangerfiles:

297

safe_cache = cache.copy()

298

for file_id in dangerfiles:

299

del safe_cache[file_id]

300

else:

301

safe_cache = cache

302

303

_try_write_cache(basedir, safe_cache.itervalues())

304

305

return cache

Older »