~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/statcache.py

Committer: Martin Pool
Date: 2005-05-11 06:18:45 UTC
Revision ID: mbp@sourcefrog.net-20050511061845-f8cd99b4312e72f8

- Optional branch parameter to info command

files added:
bzrlib/tests.py

files removed:
bzr-man.py

bzrlib/changeset.py

bzrlib/commit.py

bzrlib/lock.py

bzrlib/merge.py

bzrlib/merge_core.py

bzrlib/meta_store.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/selftest

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/plugins.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/upgrade.py

contrib/bash/bzr

contrib/create_bzr_rollup.py

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

effbot

effbot/__init__.py

effbot/org

effbot/org/__init__.py

effbot/org/gzip_consumer.py

effbot/org/http_client.py

effbot/org/http_manager.py

patches

patches/annotate3.patch

patches/annotate4.patch

patches/cache-remote-revisions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

plugins

plugins/changeset

plugins/changeset/__init__.py

plugins/changeset/apply_changeset.py

plugins/changeset/common.py

plugins/changeset/gen_changeset.py

plugins/changeset/read_changeset.py

plugins/checkperms

plugins/rsync

plugins/rsync/__init__.py

plugins/rsync/rsync_update.py

files renamed:
contrib/bash/bzr.simple => contrib/bash/bzr

files modified:
.bzrignore

NEWS

TODO

build-api

bzrlib/__init__.py

bzrlib/add.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/help.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/log.py

bzrlib/newinventory.py

bzrlib/osutils.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/statcache.py

bzrlib/status.py

bzrlib/store.py

bzrlib/tree.py

bzrlib/workingtree.py

contrib/add-bzr-to-baz

doc/formats.txt

doc/index.txt

doc/tagging.txt

testbzr

urlgrabber/keepalive.py

Show diffs side-by-side

added added

removed removed

bzrlib/statcache.py

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import stat, os, sha, time

from binascii import b2a_qp, a2b_qp

from trace import mutter

from errors import BzrError, BzrCheckError

from errors import BzrError

"""File stat cache to speed up tree comparisons.

information of a file in the working directory, without actually

reading and hashing the whole file.

Implementation

==============

Users of this module should not need to know about how this is

implemented, and in particular should not depend on the particular

data which is stored or its format.

This is done by maintaining a cache indexed by a file fingerprint of

(path, size, mtime, ctime, ino, dev) pointing to the SHA-1. If the

fingerprint has not changed, we assume the file content has not changed

to gradually adjust your clock or don't use bzr over the step.

At the moment this is stored in a simple textfile; it might be nice

to use a tdb instead to allow faster lookup by file-id.

to use a tdb instead.

The cache is represented as a map from file_id to a tuple of (file_id,

sha1, path, size, mtime, ctime, ino, dev).

The SHA-1 is stored in memory as a hexdigest.

This version of the file on disk has one line per record, and fields

separated by \0 characters.

"""

# order of fields returned by fingerprint()

FP_SIZE = 0

FP_MTIME = 1

FP_CTIME = 2

FP_INO = 3

FP_DEV = 4

# order of fields in the statcache file and in the in-memory map

SC_FILE_ID = 0

SC_SHA1 = 1

SC_PATH = 2

SC_SIZE = 3

SC_MTIME = 4

SC_CTIME = 5

SC_INO = 6

SC_DEV = 7

CACHE_HEADER = "### bzr statcache v4"

SC_SHA1 = 1

def fingerprint(abspath):

105

fs.st_ctime, fs.st_ino, fs.st_dev)

106

107

108

109

def _write_cache(basedir, entries):

def _write_cache(basedir, entry_iter, dangerfiles):

110

from atomicfile import AtomicFile

111

112

cachefn = os.path.join(basedir, '.bzr', 'stat-cache')

113

outf = AtomicFile(cachefn, 'wb')

outf = AtomicFile(cachefn, 'wb', 'utf-8')

114

try:

115

outf.write(CACHE_HEADER + '\n')

116

117

for entry in entries:

118

if len(entry) != 8:

119

raise ValueError("invalid statcache entry tuple %r" % entry)

120

outf.write(entry[0].encode('utf-8')) # file id

121

outf.write('\0')

122

outf.write(entry[1]) # hex sha1

123

outf.write('\0')

124

outf.write(entry[2].encode('utf-8')) # name

125

for nf in entry[3:]:

126

outf.write('\0%d' % nf)

127

outf.write('\n')

for entry in entry_iter:

if entry[0] in dangerfiles:

continue

outf.write(entry[0] + ' ' + entry[1] + ' ')

outf.write(b2a_qp(entry[2], True))

outf.write(' %d %d %d %d %d\n' % entry[3:])

128

129

outf.commit()

130

finally:

131

100

if not outf.closed:

132

101

outf.abort()

133

134

135

def _try_write_cache(basedir, entries):

136

try:

137

return _write_cache(basedir, entries)

138

except IOError, e:

139

mutter("cannot update statcache in %s: %s" % (basedir, e))

140

except OSError, e:

141

mutter("cannot update statcache in %s: %s" % (basedir, e))

142

143

102

144

103

145

104

def load_cache(basedir):

146

import re

105

import codecs

106

147

107

cache = {}

148

seen_paths = {}

149

from bzrlib.trace import warning

150

151

assert isinstance(basedir, basestring)

152

153

sha_re = re.compile(r'[a-f0-9]{40}')

154

108

155

109

try:

156

110

cachefn = os.path.join(basedir, '.bzr', 'stat-cache')

157

cachefile = open(cachefn, 'rb')

111

cachefile = codecs.open(cachefn, 'r', 'utf-8')

158

112

except IOError:

159

113

return cache

160

161

line1 = cachefile.readline().rstrip('\r\n')

162

if line1 != CACHE_HEADER:

163

mutter('cache header marker not found at top of %s; discarding cache'

164

% cachefn)

165

return cache

166

114

167

115

for l in cachefile:

168

f = l.split('\0')

169

170

file_id = f[0].decode('utf-8')

116

f = l.split(' ')

117

file_id = f[0]

171

118

if file_id in cache:

172

warning("duplicated file_id in cache: {%s}" % file_id)

173

174

text_sha = f[1]

175

if len(text_sha) != 40 or not sha_re.match(text_sha):

176

raise BzrCheckError("invalid file SHA-1 in cache: %r" % text_sha)

177

178

path = f[2].decode('utf-8')

179

if path in seen_paths:

180

warning("duplicated path in cache: %r" % path)

181

seen_paths[path] = True

182

183

entry = (file_id, text_sha, path) + tuple([long(x) for x in f[3:]])

184

if len(entry) != 8:

185

raise ValueError("invalid statcache entry tuple %r" % entry)

186

187

cache[file_id] = entry

119

raise BzrError("duplicated file_id in cache: {%s}" % file_id)

120

cache[file_id] = (f[0], f[1], a2b_qp(f[2])) + tuple([long(x) for x in f[3:]])

188

121

return cache

189

122

190

123

191

124

125

192

126

def _files_from_inventory(inv):

193

127

for path, ie in inv.iter_entries():

194

128

if ie.kind != 'file':

206

140

flush -- discard any previous cache and recalculate from scratch.

207

141

"""

208

142

209

# load the existing cache; use information there to find a list of

210

# files ordered by inode, which is alleged to be the fastest order

211

# to stat the files.

212

143

213

to_update = _files_from_inventory(inv)

144

# TODO: It's supposed to be faster to stat the files in order by inum.

145

# We don't directly know the inum of the files of course but we do

146

# know where they were last sighted, so we can sort by that.

214

147

215

148

assert isinstance(flush, bool)

216

149

if flush:

217

150

cache = {}

218

151

else:

219

152

cache = load_cache(basedir)

220

221

by_inode = []

222

without_inode = []

223

for file_id, path in to_update:

224

if file_id in cache:

225

by_inode.append((cache[file_id][SC_INO], file_id, path))

226

else:

227

without_inode.append((file_id, path))

228

by_inode.sort()

229

230

to_update = [a[1:] for a in by_inode] + without_inode

231

232

stat_cnt = missing_cnt = new_cnt = hardcheck = change_cnt = 0

233

234

# dangerfiles have been recently touched and can't be committed to

235

# a persistent cache yet, but they are returned to the caller.

236

dangerfiles = []

153

return _update_cache_from_list(basedir, cache, _files_from_inventory(inv))

154

155

156

157

def _update_cache_from_list(basedir, cache, to_update):

158

"""Update and return the cache for given files.

159

160

cache -- Previously cached values to be validated.

161

162

to_update -- Sequence of (file_id, path) pairs to check.

163

"""

164

165

from sets import Set

166

167

stat_cnt = missing_cnt = hardcheck = change_cnt = 0

168

169

# files that have been recently touched and can't be

170

# committed to a persistent cache yet.

237

171

172

dangerfiles = Set()

238

173

now = int(time.time())

239

174

240

175

## mutter('update statcache under %r' % basedir)

251

186

change_cnt += 1

252

187

missing_cnt += 1

253

188

continue

254

elif not cacheentry:

255

new_cnt += 1

256

189

257

190

if (fp[FP_MTIME] >= now) or (fp[FP_CTIME] >= now):

258

dangerfiles.append(file_id)

191

dangerfiles.add(file_id)

259

192

260

193

if cacheentry and (cacheentry[3:] == fp):

261

194

continue # all stat fields unchanged

264

197

265

198

dig = sha.new(file(abspath, 'rb').read()).hexdigest()

266

199

267

# We update the cache even if the digest has not changed from

268

# last time we looked, so that the fingerprint fields will

269

# match in future.

270

cacheentry = (file_id, dig, path) + fp

271

cache[file_id] = cacheentry

272

change_cnt += 1

200

if cacheentry == None or dig != cacheentry[1]:

201

# if there was no previous entry for this file, or if the

202

# SHA has changed, then update the cache

203

cacheentry = (file_id, dig, path) + fp

204

cache[file_id] = cacheentry

205

change_cnt += 1

273

206

274

207

mutter('statcache: statted %d files, read %d files, %d changed, %d dangerous, '

275

'%d deleted, %d new, '

276

208

'%d in cache'

277

% (stat_cnt, hardcheck, change_cnt, len(dangerfiles),

278

missing_cnt, new_cnt, len(cache)))

209

% (stat_cnt, hardcheck, change_cnt, len(dangerfiles), len(cache)))

279

210

280

211

if change_cnt:

281

212

mutter('updating on-disk statcache')

282

283

if dangerfiles:

284

safe_cache = cache.copy()

285

for file_id in dangerfiles:

286

del safe_cache[file_id]

287

else:

288

safe_cache = cache

289

290

_try_write_cache(basedir, safe_cache.itervalues())

213

_write_cache(basedir, cache.itervalues(), dangerfiles)

291

214

292

215

return cache

Older »