~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/statcache.py

Committer: Martin Pool
Date: 2005-06-10 06:29:35 UTC
Revision ID: mbp@sourcefrog.net-20050610062935-cd2fc37ca7ae1e09

- split out proposed progress module

files added:
bzr-man.py

bzrlib/changeset.py

bzrlib/commit.py

bzrlib/lock.py

bzrlib/merge.py

bzrlib/merge_core.py

bzrlib/patch.py

bzrlib/selftest.py

bzrlib/whitebox.py

contrib/bash/bzr

contrib/create_bzr_rollup.py

contrib/upload-bzr.dev

patches

patches/annotate3.patch

patches/find-touching-from-seq.diff

patches/progress.diff

patches/symlink-support.patch

files removed:
bzrlib/tests.py

files renamed:
contrib/bash/bzr => contrib/bash/bzr.simple

files modified:
.bzrignore

NEWS

TODO

build-api

bzrlib/__init__.py

bzrlib/add.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/help.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/log.py

bzrlib/osutils.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/statcache.py

bzrlib/status.py

bzrlib/store.py

bzrlib/tree.py

bzrlib/workingtree.py

contrib/add-bzr-to-baz

doc/index.txt

doc/tagging.txt

testbzr

urlgrabber/keepalive.py

Show diffs side-by-side

added added

removed removed

bzrlib/statcache.py

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import stat, os, sha, time

from binascii import b2a_qp, a2b_qp

from trace import mutter

from errors import BzrError

from errors import BzrError, BzrCheckError

"""File stat cache to speed up tree comparisons.

information of a file in the working directory, without actually

reading and hashing the whole file.

Implementation

==============

Users of this module should not need to know about how this is

implemented, and in particular should not depend on the particular

data which is stored or its format.

This is done by maintaining a cache indexed by a file fingerprint of

(path, size, mtime, ctime, ino, dev) pointing to the SHA-1. If the

fingerprint has not changed, we assume the file content has not changed

to gradually adjust your clock or don't use bzr over the step.

At the moment this is stored in a simple textfile; it might be nice

to use a tdb instead.

to use a tdb instead to allow faster lookup by file-id.

The cache is represented as a map from file_id to a tuple of (file_id,

sha1, path, size, mtime, ctime, ino, dev).

The SHA-1 is stored in memory as a hexdigest.

This version of the file on disk has one line per record, and fields

separated by \0 characters.

"""

# order of fields returned by fingerprint()

FP_SIZE = 0

FP_MTIME = 1

FP_CTIME = 2

FP_INO = 3

FP_DEV = 4

# order of fields in the statcache file and in the in-memory map

SC_FILE_ID = 0

SC_SHA1 = 1

SC_PATH = 2

SC_SIZE = 3

SC_MTIME = 4

SC_CTIME = 5

SC_INO = 6

SC_DEV = 7

CACHE_HEADER = "### bzr statcache v4"

def fingerprint(abspath):

105

fs.st_ctime, fs.st_ino, fs.st_dev)

106

107

def _write_cache(basedir, entry_iter, dangerfiles):

108

109

def _write_cache(basedir, entries):

110

from atomicfile import AtomicFile

111

112

cachefn = os.path.join(basedir, '.bzr', 'stat-cache')

outf = AtomicFile(cachefn, 'wb', 'utf-8')

113

outf = AtomicFile(cachefn, 'wb')

114

try:

for entry in entry_iter:

if entry[0] in dangerfiles:

continue

outf.write(entry[0] + ' ' + entry[1] + ' ')

outf.write(b2a_qp(entry[2], True))

outf.write(' %d %d %d %d %d\n' % entry[3:])

115

outf.write(CACHE_HEADER + '\n')

116

117

for entry in entries:

118

if len(entry) != 8:

119

raise ValueError("invalid statcache entry tuple %r" % entry)

120

outf.write(entry[0].encode('utf-8')) # file id

121

outf.write('\0')

122

outf.write(entry[1]) # hex sha1

123

outf.write('\0')

124

outf.write(entry[2].encode('utf-8')) # name

125

for nf in entry[3:]:

126

outf.write('\0%d' % nf)

127

outf.write('\n')

128

129

outf.commit()

130

finally:

100

131

if not outf.closed:

101

132

outf.abort()

133

134

135

def _try_write_cache(basedir, entries):

136

try:

137

return _write_cache(basedir, entries)

138

except IOError, e:

139

mutter("cannot update statcache in %s: %s" % (basedir, e))

140

except OSError, e:

141

mutter("cannot update statcache in %s: %s" % (basedir, e))

142

102

143

103

144

104

145

def load_cache(basedir):

105

import codecs

106

146

import re

107

147

cache = {}

148

seen_paths = {}

149

from bzrlib.trace import warning

150

151

assert isinstance(basedir, basestring)

152

153

sha_re = re.compile(r'[a-f0-9]{40}')

108

154

109

155

try:

110

156

cachefn = os.path.join(basedir, '.bzr', 'stat-cache')

111

cachefile = codecs.open(cachefn, 'r', 'utf-8')

157

cachefile = open(cachefn, 'rb')

112

158

except IOError:

113

159

return cache

114

160

161

line1 = cachefile.readline().rstrip('\r\n')

162

if line1 != CACHE_HEADER:

163

mutter('cache header marker not found at top of %s; discarding cache'

164

% cachefn)

165

return cache

166

115

167

for l in cachefile:

116

f = l.split(' ')

117

file_id = f[0]

168

f = l.split('\0')

169

170

file_id = f[0].decode('utf-8')

118

171

if file_id in cache:

119

raise BzrError("duplicated file_id in cache: {%s}" % file_id)

120

cache[file_id] = (f[0], f[1], a2b_qp(f[2])) + tuple([long(x) for x in f[3:]])

172

warning("duplicated file_id in cache: {%s}" % file_id)

173

174

text_sha = f[1]

175

if len(text_sha) != 40 or not sha_re.match(text_sha):

176

raise BzrCheckError("invalid file SHA-1 in cache: %r" % text_sha)

177

178

path = f[2].decode('utf-8')

179

if path in seen_paths:

180

warning("duplicated path in cache: %r" % path)

181

seen_paths[path] = True

182

183

entry = (file_id, text_sha, path) + tuple([long(x) for x in f[3:]])

184

if len(entry) != 8:

185

raise ValueError("invalid statcache entry tuple %r" % entry)

186

187

cache[file_id] = entry

121

188

return cache

122

189

123

190

124

191

125

126

192

def _files_from_inventory(inv):

127

193

for path, ie in inv.iter_entries():

128

194

if ie.kind != 'file':

140

206

flush -- discard any previous cache and recalculate from scratch.

141

207

"""

142

208

209

# load the existing cache; use information there to find a list of

210

# files ordered by inode, which is alleged to be the fastest order

211

# to stat the files.

143

212

144

# TODO: It's supposed to be faster to stat the files in order by inum.

145

# We don't directly know the inum of the files of course but we do

146

# know where they were last sighted, so we can sort by that.

213

to_update = _files_from_inventory(inv)

147

214

148

215

assert isinstance(flush, bool)

149

216

if flush:

150

217

cache = {}

151

218

else:

152

219

cache = load_cache(basedir)

153

return _update_cache_from_list(basedir, cache, _files_from_inventory(inv))

154

155

156

157

def _update_cache_from_list(basedir, cache, to_update):

158

"""Update and return the cache for given files.

159

160

cache -- Previously cached values to be validated.

161

162

to_update -- Sequence of (file_id, path) pairs to check.

163

"""

164

165

from sets import Set

166

167

stat_cnt = missing_cnt = hardcheck = change_cnt = 0

168

169

# files that have been recently touched and can't be

170

# committed to a persistent cache yet.

220

221

by_inode = []

222

without_inode = []

223

for file_id, path in to_update:

224

if file_id in cache:

225

by_inode.append((cache[file_id][SC_INO], file_id, path))

226

else:

227

without_inode.append((file_id, path))

228

by_inode.sort()

229

230

to_update = [a[1:] for a in by_inode] + without_inode

231

232

stat_cnt = missing_cnt = new_cnt = hardcheck = change_cnt = 0

233

234

# dangerfiles have been recently touched and can't be committed to

235

# a persistent cache yet, but they are returned to the caller.

236

dangerfiles = []

171

237

172

dangerfiles = Set()

173

238

now = int(time.time())

174

239

175

240

## mutter('update statcache under %r' % basedir)

186

251

change_cnt += 1

187

252

missing_cnt += 1

188

253

continue

254

elif not cacheentry:

255

new_cnt += 1

189

256

190

257

if (fp[FP_MTIME] >= now) or (fp[FP_CTIME] >= now):

191

dangerfiles.add(file_id)

258

dangerfiles.append(file_id)

192

259

193

260

if cacheentry and (cacheentry[3:] == fp):

194

261

continue # all stat fields unchanged

197

264

198

265

dig = sha.new(file(abspath, 'rb').read()).hexdigest()

199

266

200

if cacheentry == None or dig != cacheentry[1]:

201

# if there was no previous entry for this file, or if the

202

# SHA has changed, then update the cache

203

cacheentry = (file_id, dig, path) + fp

204

cache[file_id] = cacheentry

205

change_cnt += 1

267

# We update the cache even if the digest has not changed from

268

# last time we looked, so that the fingerprint fields will

269

# match in future.

270

cacheentry = (file_id, dig, path) + fp

271

cache[file_id] = cacheentry

272

change_cnt += 1

206

273

207

274

mutter('statcache: statted %d files, read %d files, %d changed, %d dangerous, '

275

'%d deleted, %d new, '

208

276

'%d in cache'

209

% (stat_cnt, hardcheck, change_cnt, len(dangerfiles), len(cache)))

277

% (stat_cnt, hardcheck, change_cnt, len(dangerfiles),

278

missing_cnt, new_cnt, len(cache)))

210

279

211

280

if change_cnt:

212

281

mutter('updating on-disk statcache')

213

_write_cache(basedir, cache.itervalues(), dangerfiles)

282

283

if dangerfiles:

284

safe_cache = cache.copy()

285

for file_id in dangerfiles:

286

del safe_cache[file_id]

287

else:

288

safe_cache = cache

289

290

_try_write_cache(basedir, safe_cache.itervalues())

214

291

215

292

return cache

Older »