# TODO: Perhaps return more details on the file to avoid statting it
# again: nonexistent, file type, size, etc

# TODO: Perhaps use a Python pickle instead of a text file; might be faster.

CACHE_HEADER = "### bzr hashcache v5\n"

import os, stat, time
import sha

from bzrlib.osutils import sha_file, pathjoin, safe_unicode
from bzrlib.trace import mutter, warning
from bzrlib.atomicfile import AtomicFile
from bzrlib.errors import BzrError

# Columns of the fingerprint tuple returned by HashCache._fingerprint:
# (size, mtime, ctime, inode, device, mode)
FP_MTIME_COLUMN = 1
FP_CTIME_COLUMN = 2
FP_MODE_COLUMN = 5
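
# For a regular file, HashCache._fingerprint (below) returns a tuple like
# (hypothetical values):
#
#   (5, 1130000000L, 1130000000L, 12345L, 2049L, 33188L)
#   size  mtime        ctime       inode   device  mode
#
# so fp[FP_MTIME_COLUMN] is the mtime, fp[FP_CTIME_COLUMN] the ctime, and
# fp[FP_MODE_COLUMN] the st_mode used to tell regular files from symlinks.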

class HashCache(object):
    """Cache for looking up file SHA-1."""

    def clear(self):
        """Discard all cached information.

        This does not reset the counters."""
        if self._cache:
            self.needs_write = True
            self._cache = {}

"""Scan all files and remove entries where the cache entry is obsolete.
110
118
Obsolete entries are those where the file has been modified or deleted
111
119
since the entry was inserted.
113
# FIXME optimisation opportunity, on linux [and check other oses]:
114
# rather than iteritems order, stat in inode order.
115
121
prep = [(ce[1][3], path, ce) for (path, ce) in self._cache.iteritems()]

        for inum, path, cache_entry in prep:
            abspath = pathjoin(self.root, path)
            fp = self._fingerprint(abspath)
            self.stat_count += 1

            cache_fp = cache_entry[1]

            if (not fp) or (cache_fp != fp):
                # not here or not a regular file anymore
                self.removed_count += 1
                self.needs_write = True
                del self._cache[path]

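    # Example of the inode ordering above (hypothetical entry): a cache entry
    # {'b.txt': ('da39a3ee...', (5, 1130000000L, 1130000000L, 7L, 2049L, 33188L))}
    # yields the prep tuple (7L, 'b.txt', <entry>) because ce[1][3] is st_ino,
    # so sorting prep makes scan() stat files in inode order rather than in
    # dictionary order, which tends to reduce disk seeks.
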
    def get_sha1(self, path):
        """Return the sha1 of a file.
        """
        abspath = pathjoin(self.root, path)
        self.stat_count += 1
        file_fp = self._fingerprint(abspath)

        if not file_fp:
            # not a regular file or not existing
            if path in self._cache:
                self.removed_count += 1
                self.needs_write = True
                del self._cache[path]
            return None

        if path in self._cache:
            cache_sha1, cache_fp = self._cache[path]
        else:
            cache_sha1, cache_fp = None, None

        if cache_fp == file_fp:
            ## mutter("hashcache hit for %s %r -> %s", path, file_fp, cache_sha1)
            ## mutter("now = %s", time.time())
            self.hit_count += 1
            return cache_sha1

        self.miss_count += 1

        mode = file_fp[FP_MODE_COLUMN]
        if stat.S_ISREG(mode):
            digest = self._really_sha1_file(abspath)
        elif stat.S_ISLNK(mode):
            digest = sha.new(os.readlink(abspath)).hexdigest()
        else:
            raise BzrError("file %r: unknown file stat mode: %o" % (abspath, mode))

        # window of 3 seconds to allow for 2s resolution on windows,
        # unsynchronized file servers, etc.
        cutoff = self._cutoff_time()
        if file_fp[FP_MTIME_COLUMN] >= cutoff \
                or file_fp[FP_CTIME_COLUMN] >= cutoff:
            # changed too recently; can't be cached.  we can
            # return the result and it could possibly be cached
            # next time.
            #
            # the point is that we only want to cache when we are sure that any
            # subsequent modifications of the file can be detected.  If a
            # modification neither changes the inode, the device, the size, nor
            # the mode, then we can only distinguish it by time; therefore we
            # need to let sufficient time elapse before we may cache this entry
            # again.  If we didn't do this, then, for example, a very quick 1
            # byte replacement in the file might go undetected.
            ## mutter('%r modified too recently; not caching', path)
            self.danger_count += 1
            if cache_fp:
                self.removed_count += 1
                self.needs_write = True
                del self._cache[path]
        else:
            ## mutter('%r added to cache: now=%f, mtime=%d, ctime=%d',
            ##        path, time.time(), file_fp[FP_MTIME_COLUMN],
            ##        file_fp[FP_CTIME_COLUMN])
            self.update_count += 1
            self.needs_write = True
            self._cache[path] = (digest, file_fp)
        return digest

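    # Worked illustration of the race the cutoff guards against (a
    # hypothetical timeline, assuming 1-second stat resolution):
    #
    #   t=10.1  file written with contents A; size=5, mtime=10
    #   t=10.2  get_sha1() hashes A; fingerprint (5, 10, 10, ...) looks valid
    #   t=10.7  file rewritten with contents B, also 5 bytes; mtime still 10
    #
    # The second write changes neither size, mtime, ctime, inode, device nor
    # mode, so a cached entry from t=10.2 could never be invalidated.
    # Refusing to cache fingerprints newer than _cutoff_time() closes this
    # window.
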
    def _really_sha1_file(self, abspath):
        """Calculate the SHA1 of a file by reading the full text"""
        return sha_file(file(abspath, 'rb', buffering=65000))

    def write(self):
        """Write contents of cache to file."""
        outf = AtomicFile(self.cache_file_name(), 'wb', new_mode=self._mode)
        try:
            print >>outf, CACHE_HEADER,

            for path, c in self._cache.iteritems():
                assert '//' not in path, path
                outf.write(path.encode('utf-8'))
                outf.write('// ')
                print >>outf, c[0],     # hex sha1 of the file contents
                for fld in c[1]:        # the six fingerprint fields
                    print >>outf, "%d" % fld,
                print >>outf
            outf.commit()
            self.needs_write = False
            ## mutter("write hash cache: %s hits=%d misses=%d stat=%d recent=%d updates=%d",
            ##        self.cache_file_name(), self.hit_count, self.miss_count,
            ##        self.stat_count,
            ##        self.danger_count, self.update_count)
        finally:
            if not outf.closed:
                outf.abort()

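    # A cache file produced by write() looks like this (hypothetical values):
    #
    #   ### bzr hashcache v5
    #   hello.txt// 34f39... 5 1130000000 1130000000 12345 2049 33188
    #
    # i.e. one header line, then one line per file: the path, a '// '
    # separator, the hex sha1, and the six fingerprint fields.
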
    def read(self):
        """Reinstate cache from file."""
        self._cache = {}
        fn = self.cache_file_name()
        try:
            inf = file(fn, 'rb', buffering=65000)
        except IOError, e:
            mutter("failed to open %s: %s", fn, e)
            # better write it now so it is valid
            self.needs_write = True
            return

        hdr = inf.readline()
        if hdr != CACHE_HEADER:
            mutter('cache header marker not found at top of %s;'
                   ' discarding cache', fn)
            self.needs_write = True
            return

        for l in inf:
            pos = l.index('// ')
            path = l[:pos].decode('utf-8')
            if path in self._cache:
                warning('duplicated path %r in cache' % path)
                continue
            pos += 3
            fields = l[pos:].split(' ')
            if len(fields) != 7:
                warning("bad line in hashcache: %r" % l)
                continue
            sha1 = fields[0]
            if len(sha1) != 40:
                warning("bad sha1 in hashcache: %r" % sha1)
                continue
            fp = tuple(map(long, fields[1:]))
            self._cache[path] = (sha1, fp)

        self.needs_write = False

    def _cutoff_time(self):
        """Return cutoff time.

        Files modified more recently than this time are at risk of being
        undetectably modified and so can't be cached.
        """
        return int(time.time()) - 3

    def _fingerprint(self, abspath):
        try:
            fs = os.lstat(abspath)
        except OSError:
            # might be missing, etc
            return None
        if stat.S_ISDIR(fs.st_mode):
            return None
        # we discard any high precision because it's not reliable; perhaps we
        # could do better on some systems?
        return (fs.st_size, long(fs.st_mtime),
                long(fs.st_ctime), fs.st_ino, fs.st_dev, fs.st_mode)
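

# A minimal usage sketch (the file names and scratch cache path are
# hypothetical, and a HashCache(root, cache_file_name) constructor is
# assumed; run from inside a writable directory):
#
#   import time
#   from bzrlib.hashcache import HashCache
#   hc = HashCache(u'.', u'.bzr-hashcache-demo')
#   hc.read()                        # load any previously written cache
#   open('demo.txt', 'wb').write('hello\n')
#   hc.get_sha1(u'demo.txt')         # miss: hashed, but too fresh to cache
#   time.sleep(4)                    # let the 3-second cutoff window pass
#   hc.get_sha1(u'demo.txt')         # miss again, but now cached
#   hc.get_sha1(u'demo.txt')         # hit: no re-hash needed
#   if hc.needs_write:
#       hc.write()                   # persist fingerprints for the next run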