~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/hashcache.py

Committer: John Arbash Meinel
Date: 2006-01-23 22:30:55 UTC
mfrom: (1185.50.51 bzr-jam-integration)
mto: This revision was merged to the branch mainline in revision 1551.
Revision ID: john@arbash-meinel.com-20060123223055-43e99c932f0c8945

[merge] jam-integration

files added:
.rsyncexclude

BRANCH.TODO

HACKING

INSTALL

Makefile

NEWS

NEWS.developers

TODO

bzr_man.py

bzrlib/add.py

bzrlib/annotate.py

bzrlib/atomicfile.py

bzrlib/builtins.py

bzrlib/changeset.py

bzrlib/clone.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/delta.py

bzrlib/export

bzrlib/export/__init__.py

bzrlib/export/dir_exporter.py

bzrlib/export/tar_exporter.py

bzrlib/export/zip_exporter.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/identitymap.py

bzrlib/info.py

bzrlib/intset.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/lsprof.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/option.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/progress.py

bzrlib/revisionspec.py

bzrlib/rio.py

bzrlib/shellcomplete.py

bzrlib/status.py

bzrlib/store

bzrlib/store/text.py

bzrlib/store/weave.py

bzrlib/symbol_versioning.py

bzrlib/testament.py

bzrlib/tests

bzrlib/tests/HTTPTestUtil.py

bzrlib/tests/TestUtil.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_annotate.py

bzrlib/tests/test_api.py

bzrlib/tests/test_bad_files.py

bzrlib/tests/test_basis_inventory.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_command.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_commit_merge.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_fileid_involved.py

bzrlib/tests/test_gpg.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_hashcache.py

bzrlib/tests/test_http.py

bzrlib/tests/test_identitymap.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_log.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge3.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_missing.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_nonascii.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_parent.py

bzrlib/tests/test_permissions.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_remove.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_revisionnamespaces.py

bzrlib/tests/test_revprops.py

bzrlib/tests/test_reweave.py

bzrlib/tests/test_rio.py

bzrlib/tests/test_sampler.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_source.py

bzrlib/tests/test_status.py

bzrlib/tests/test_store.py

bzrlib/tests/test_symbol_versioning.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_trace.py

bzrlib/tests/test_transactions.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_tsort.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_uncommit.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_weave.py

bzrlib/tests/test_whitebox.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_xml.py

bzrlib/tests/treeshape.py

bzrlib/textinv.py

bzrlib/transactions.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/ftp.py

bzrlib/transport/http.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/sftp.py

bzrlib/tsort.py

bzrlib/ui

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/uncommit.py

bzrlib/upgrade.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/util/configobj/validate.py

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/util/elementtree

bzrlib/util/elementtree/ElementTree.py

bzrlib/util/elementtree/__init__.py

bzrlib/util/urlgrabber

bzrlib/util/urlgrabber/__init__.py

bzrlib/util/urlgrabber/byterange.py

bzrlib/util/urlgrabber/grabber.py

bzrlib/util/urlgrabber/keepalive.py

bzrlib/util/urlgrabber/mirror.py

bzrlib/util/urlgrabber/progress.py

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/win32console.py

bzrlib/workingtree.py

bzrlib/xml4.py

bzrlib/xml5.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/newinventory.py

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

doc/ignore.txt

doc/quotes.txt

doc/revfile-annotation.txt

doc/revfile.txt

doc/split-join-files.txt

doc/switch-in-branch.txt

notes/inventory-v2-sample.xml

notes/inventory-v2.rnc

notes/new-inventory-sample.xml

notes/revfile.txt

notes/schemas.xml

patches

tools

tools/biobench.py

tools/capture_tree.py

tools/convertfile.py

tools/convertinv.py

tools/history2revfiles.py

tools/http_client.py

tools/riodemo.py

tools/trace-revisions

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files removed:
bzrlib/tests.py

doc/faq.txt

doc/quickref.txt

doc/roadmap.txt

doc/testing.txt

doc/work-order.txt

files renamed:
bzrlib/store.py => bzrlib/store/__init__.py

bzrlib/xml.py => bzrlib/xml_serializer.py

files modified:
.bzrignore

README

build-api

bzr *

bzrlib/__init__.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/inventory.py

bzrlib/osutils.py

bzrlib/revision.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

doc/Makefile

doc/bitkeeper.txt

doc/darcs.txt

doc/formats.txt

doc/index.txt

doc/interrupted.txt

doc/merge.txt

doc/purpose.txt

doc/python.txt

doc/random.txt

doc/svk.txt

doc/tagging.txt

doc/thanks.txt

doc/todo-from-arch.txt

notes/performance.txt

setup.py *

Show diffs side-by-side

added added

removed removed

bzrlib/hashcache.py

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

# TODO: Up-front, stat all files in order and remove those which are deleted or

# out-of-date. Don't actually re-read them until they're needed. That ought

# to bring all the inodes into core so that future stats to them are fast, and

# it preserves the nice property that any caller will always get up-to-date

# data except in unavoidable cases.

# TODO: Perhaps return more details on the file to avoid statting it

# again: nonexistent, file type, size, etc

# TODO: Perhaps use a Python pickle instead of a text file; might be faster.

# Version marker written as the first line of the cache file by
# HashCache.write(); HashCache.read() discards a cache file whose first line
# differs from it.
CACHE_HEADER = "### bzr hashcache v5\n"

import os, stat, time

import sha

from bzrlib.osutils import sha_file, pathjoin

from bzrlib.trace import mutter, warning

from bzrlib.atomicfile import AtomicFile

from bzrlib.errors import BzrError

# Indices into the fingerprint tuple returned by _fingerprint(), which is
# laid out as (size, mtime, ctime, inode, device, mode).
FP_MTIME_COLUMN = 1

FP_CTIME_COLUMN = 2

FP_MODE_COLUMN = 5

def _fingerprint(abspath):
    """Return a stat-based fingerprint for abspath, or None.

    The path is examined with lstat, so a symlink is fingerprinted itself
    rather than its target.  None is returned when the path cannot be
    statted (missing, etc) or when it is a directory.

    Otherwise the result is the tuple
    (size, mtime, ctime, inode, device, mode), with both timestamps
    truncated to whole seconds.
    """
    try:
        st = os.lstat(abspath)
    except OSError:
        # might be missing, etc
        return None

    mode = st.st_mode
    if stat.S_ISDIR(mode):
        return None

    # we discard any high precision because it's not reliable; perhaps we
    # could do better on some systems?
    mtime = long(st.st_mtime)
    ctime = long(st.st_ctime)
    return (st.st_size, mtime, ctime, st.st_ino, st.st_dev, mode)

class HashCache(object):

"""Cache for looking up file SHA-1.

Files are considered to match the cached value if the fingerprint

of the file has not changed. This includes its mtime, ctime,

device number, inode number, and size. This should catch

modifications or replacement of the file by a new one.

This may not catch modifications that do not change the file's

size and that occur within the resolution window of the

timestamps. To handle this we specifically do not cache files

which have changed since the start of the present second, since

they could undetectably change again.

This scheme may fail if the machine's clock steps backwards.

Don't do that.

This does not canonicalize the paths passed in; that should be

done by the caller.

_cache

Indexed by path, points to a two-tuple of the SHA-1 of the file.

and its fingerprint.

stat_count

number of times files have been statted

hit_count

number of times files have been retrieved from the cache, avoiding a

re-read

miss_count

number of misses (times files have been completely re-read)

"""

needs_write = False

def __init__(self, basedir):

self.basedir = basedir

self.hit_count = 0

100

self.miss_count = 0

101

self.stat_count = 0

102

self.danger_count = 0

103

self.removed_count = 0

104

self.update_count = 0

105

self._cache = {}

106

107

def cache_file_name(self):

108

# FIXME: duplicate path logic here, this should be

109

# something like 'branch.controlfile'.

110

return pathjoin(self.basedir, '.bzr', 'stat-cache')

111

112

def clear(self):

113

"""Discard all cached information.

114

115

This does not reset the counters."""

116

if self._cache:

117

self.needs_write = True

118

self._cache = {}

119

120

121

def scan(self):

122

"""Scan all files and remove entries where the cache entry is obsolete.

123

124

Obsolete entries are those where the file has been modified or deleted

125

since the entry was inserted.

126

"""

127

prep = [(ce[1][3], path, ce) for (path, ce) in self._cache.iteritems()]

128

prep.sort()

129

130

for inum, path, cache_entry in prep:

131

abspath = pathjoin(self.basedir, path)

132

fp = _fingerprint(abspath)

133

self.stat_count += 1

134

135

cache_fp = cache_entry[1]

136

137

if (not fp) or (cache_fp != fp):

138

# not here or not a regular file anymore

139

self.removed_count += 1

140

self.needs_write = True

141

del self._cache[path]

142

143

144

def get_sha1(self, path):

145

"""Return the sha1 of a file.

146

"""

147

abspath = pathjoin(self.basedir, path)

148

self.stat_count += 1

149

file_fp = _fingerprint(abspath)

150

151

if not file_fp:

152

# not a regular file or not existing

153

if path in self._cache:

154

self.removed_count += 1

155

self.needs_write = True

156

del self._cache[path]

157

return None

158

159

if path in self._cache:

160

cache_sha1, cache_fp = self._cache[path]

161

else:

162

cache_sha1, cache_fp = None, None

163

164

if cache_fp == file_fp:

165

self.hit_count += 1

166

return cache_sha1

167

168

self.miss_count += 1

169

170

171

mode = file_fp[FP_MODE_COLUMN]

172

if stat.S_ISREG(mode):

173

digest = sha_file(file(abspath, 'rb', buffering=65000))

174

elif stat.S_ISLNK(mode):

175

digest = sha.new(os.readlink(abspath)).hexdigest()

176

else:

177

raise BzrError("file %r: unknown file stat mode: %o"%(abspath,mode))

178

179

now = int(time.time())

180

if file_fp[FP_MTIME_COLUMN] >= now or file_fp[FP_CTIME_COLUMN] >= now:

181

# changed too recently; can't be cached. we can

182

# return the result and it could possibly be cached

183

# next time.

184

185

# the point is that we only want to cache when we are sure that any

186

# subsequent modifications of the file can be detected. If a

187

# modification neither changes the inode, the device, the size, nor

188

# the mode, then we can only distinguish it by time; therefore we

189

# need to let sufficient time elapse before we may cache this entry

190

# again. If we didn't do this, then, for example, a very quick 1

191

# byte replacement in the file might go undetected.

192

self.danger_count += 1

193

if cache_fp:

194

self.removed_count += 1

195

self.needs_write = True

196

del self._cache[path]

197

else:

198

self.update_count += 1

199

self.needs_write = True

200

self._cache[path] = (digest, file_fp)

201

return digest

202

203

def write(self):

204

"""Write contents of cache to file."""

205

outf = AtomicFile(self.cache_file_name(), 'wb')

206

try:

207

print >>outf, CACHE_HEADER,

208

209

for path, c in self._cache.iteritems():

210

assert '//' not in path, path

211

outf.write(path.encode('utf-8'))

212

outf.write('// ')

213

print >>outf, c[0], # hex sha1

214

for fld in c[1]:

215

print >>outf, "%d" % fld,

216

print >>outf

217

218

outf.commit()

219

self.needs_write = False

220

finally:

221

if not outf.closed:

222

outf.abort()

223

224

def read(self):

225

"""Reinstate cache from file.

226

227

Overwrites existing cache.

228

229

If the cache file has the wrong version marker, this just clears

230

the cache."""

231

self._cache = {}

232

233

fn = self.cache_file_name()

234

try:

235

inf = file(fn, 'rb', buffering=65000)

236

except IOError, e:

237

mutter("failed to open %s: %s", fn, e)

238

# better write it now so it is valid

239

self.needs_write = True

240

return

241

242

243

hdr = inf.readline()

244

if hdr != CACHE_HEADER:

245

mutter('cache header marker not found at top of %s;'

246

' discarding cache', fn)

247

self.needs_write = True

248

return

249

250

for l in inf:

251

pos = l.index('// ')

252

path = l[:pos].decode('utf-8')

253

if path in self._cache:

254

warning('duplicated path %r in cache' % path)

255

continue

256

257

pos += 3

258

fields = l[pos:].split(' ')

259

if len(fields) != 7:

260

warning("bad line in hashcache: %r" % l)

261

continue

262

263

sha1 = fields[0]

264

if len(sha1) != 40:

265

warning("bad sha1 in hashcache: %r" % sha1)

266

continue

267

268

fp = tuple(map(long, fields[1:]))

269

270

self._cache[path] = (sha1, fp)

271

272

self.needs_write = False

273

274

275

276

Older »