~bzr-pqm/bzr/bzr.dev : contents of bzrlib/hashcache.py at revision 1403

~bzr-pqm/bzr/bzr.dev : (revision 1403)

846 by Martin Pool - start adding refactored/simplified hash cache	1	# (C) 2005 Canonical Ltd
	2
	3	# This program is free software; you can redistribute it and/or modify
	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
	7
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
	15	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	16
953 by Martin Pool - refactor imports and stats for hashcache	17	# TODO: Up-front, stat all files in order and remove those which are deleted or
	18	# out-of-date. Don't actually re-read them until they're needed. That ought
	19	# to bring all the inodes into core so that future stats to them are fast, and
	20	# it preserves the nice property that any caller will always get up-to-date
	21	# data except in unavoidable cases.
864 by Martin Pool doc	22
	23	# TODO: Perhaps return more details on the file to avoid statting it
	24	# again: nonexistent, file type, size, etc
	25
1213 by Martin Pool - move import in hashcache	26	# TODO: Perhaps use a Python pickle instead of a text file; might be faster.
1213 by Martin Pool - move import in hashcache	27
864 by Martin Pool doc	28
864 by Martin Pool doc	29
# First line of the on-disk cache file.  HashCache.write() emits it and
# HashCache.read() discards any cache file whose first line differs.
CACHE_HEADER = "### bzr hashcache v5\n"
859 by Martin Pool - add HashCache.write and a simple test for it	31
953 by Martin Pool - refactor imports and stats for hashcache	32	import os, stat, time
1092.2.6 by Robert Collins symlink support updated to work	33	import sha
953 by Martin Pool - refactor imports and stats for hashcache	34
	35	from bzrlib.osutils import sha_file
	36	from bzrlib.trace import mutter, warning
1213 by Martin Pool - move import in hashcache	37	from bzrlib.atomicfile import AtomicFile
1213 by Martin Pool - move import in hashcache	38
953 by Martin Pool - refactor imports and stats for hashcache	39
# Index of st_mode within the tuple returned by _fingerprint().
FP_MODE_COLUMN = 5
859 by Martin Pool - add HashCache.write and a simple test for it	41
846 by Martin Pool - start adding refactored/simplified hash cache	42	def _fingerprint(abspath):
	43	try:
	44	fs = os.lstat(abspath)
	45	except OSError:
	46	# might be missing, etc
	47	return None
	48
	49	if stat.S_ISDIR(fs.st_mode):
	50	return None
	51
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	52	# we discard any high precision because it's not reliable; perhaps we
	53	# could do better on some systems?
	54	return (fs.st_size, long(fs.st_mtime),
1092.2.6 by Robert Collins symlink support updated to work	55	long(fs.st_ctime), fs.st_ino, fs.st_dev, fs.st_mode)
846 by Martin Pool - start adding refactored/simplified hash cache	56
	57
	58	class HashCache(object):
	59	"""Cache for looking up file SHA-1.
	60
	61	Files are considered to match the cached value if the fingerprint
	62	of the file has not changed. This includes its mtime, ctime,
	63	device number, inode number, and size. This should catch
	64	modifications or replacement of the file by a new one.
	65
	66	This may not catch modifications that do not change the file's
	67	size and that occur within the resolution window of the
	68	timestamps. To handle this we specifically do not cache files
	69	which have changed since the start of the present second, since
	70	they could undetectably change again.
	71
	72	This scheme may fail if the machine's clock steps backwards.
	73	Don't do that.
	74
	75	This does not canonicalize the paths passed in; that should be
	76	done by the caller.
	77
860 by Martin Pool - refactor hashcache to use just one dictionary	78	_cache
	79	Indexed by path, points to a two-tuple of the SHA-1 of the file.
	80	and its fingerprint.
846 by Martin Pool - start adding refactored/simplified hash cache	81
	82	stat_count
	83	number of times files have been statted
	84
	85	hit_count
	86	number of times files have been retrieved from the cache, avoiding a
	87	re-read
	88
	89	miss_count
	90	number of misses (times files have been completely re-read)
	91	"""
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	92	needs_write = False
	93
846 by Martin Pool - start adding refactored/simplified hash cache	94	def __init__(self, basedir):
	95	self.basedir = basedir
	96	self.hit_count = 0
	97	self.miss_count = 0
	98	self.stat_count = 0
	99	self.danger_count = 0
953 by Martin Pool - refactor imports and stats for hashcache	100	self.removed_count = 0
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	101	self.update_count = 0
860 by Martin Pool - refactor hashcache to use just one dictionary	102	self._cache = {}
846 by Martin Pool - start adding refactored/simplified hash cache	103
	104
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	105	def cache_file_name(self):
953 by Martin Pool - refactor imports and stats for hashcache	106	return os.sep.join([self.basedir, '.bzr', 'stat-cache'])
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	107
	108
	109
	110
846 by Martin Pool - start adding refactored/simplified hash cache	111	def clear(self):
860 by Martin Pool - refactor hashcache to use just one dictionary	112	"""Discard all cached information.
	113
	114	This does not reset the counters."""
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	115	if self._cache:
	116	self.needs_write = True
	117	self._cache = {}
846 by Martin Pool - start adding refactored/simplified hash cache	118
	119
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	120	def scan(self):
	121	"""Scan all files and remove entries where the cache entry is obsolete.
	122
	123	Obsolete entries are those where the file has been modified or deleted
	124	since the entry was inserted.
	125	"""
	126	prep = [(ce[1][3], path, ce) for (path, ce) in self._cache.iteritems()]
953 by Martin Pool - refactor imports and stats for hashcache	127	prep.sort()
	128
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	129	for inum, path, cache_entry in prep:
	130	abspath = os.sep.join([self.basedir, path])
	131	fp = _fingerprint(abspath)
	132	self.stat_count += 1
	133
	134	cache_fp = cache_entry[1]
	135
	136	if (not fp) or (cache_fp != fp):
	137	# not here or not a regular file anymore
	138	self.removed_count += 1
	139	self.needs_write = True
	140	del self._cache[path]
	141
953 by Martin Pool - refactor imports and stats for hashcache	142
846 by Martin Pool - start adding refactored/simplified hash cache	143	def get_sha1(self, path):
953 by Martin Pool - refactor imports and stats for hashcache	144	"""Return the sha1 of a file.
846 by Martin Pool - start adding refactored/simplified hash cache	145	"""
953 by Martin Pool - refactor imports and stats for hashcache	146	abspath = os.sep.join([self.basedir, path])
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	147	self.stat_count += 1
	148	file_fp = _fingerprint(abspath)
	149
	150	if not file_fp:
	151	# not a regular file or not existing
	152	if path in self._cache:
	153	self.removed_count += 1
	154	self.needs_write = True
	155	del self._cache[path]
	156	return None
953 by Martin Pool - refactor imports and stats for hashcache	157
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	158	if path in self._cache:
	159	cache_sha1, cache_fp = self._cache[path]
860 by Martin Pool - refactor hashcache to use just one dictionary	160	else:
	161	cache_sha1, cache_fp = None, None
846 by Martin Pool - start adding refactored/simplified hash cache	162
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	163	if cache_fp == file_fp:
846 by Martin Pool - start adding refactored/simplified hash cache	164	self.hit_count += 1
860 by Martin Pool - refactor hashcache to use just one dictionary	165	return cache_sha1
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	166
	167	self.miss_count += 1
1092.2.6 by Robert Collins symlink support updated to work	168
	169
	170	mode = file_fp[FP_MODE_COLUMN]
	171	if stat.S_ISREG(mode):
	172	digest = sha_file(file(abspath, 'rb', buffering=65000))
	173	elif stat.S_ISLNK(mode):
	174	link_target = os.readlink(abspath)
	175	digest = sha.new(os.readlink(abspath)).hexdigest()
	176	else:
	177	raise BzrError("file %r: unknown file stat mode: %o"%(abspath,mode))
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	178
	179	now = int(time.time())
	180	if file_fp[1] >= now or file_fp[2] >= now:
	181	# changed too recently; can't be cached. we can
	182	# return the result and it could possibly be cached
	183	# next time.
	184	self.danger_count += 1
	185	if cache_fp:
	186	self.removed_count += 1
	187	self.needs_write = True
	188	del self._cache[path]
846 by Martin Pool - start adding refactored/simplified hash cache	189	else:
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	190	self.update_count += 1
	191	self.needs_write = True
	192	self._cache[path] = (digest, file_fp)
	193	return digest
	194
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	195	def write(self):
859 by Martin Pool - add HashCache.write and a simple test for it	196	"""Write contents of cache to file."""
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	197	outf = AtomicFile(self.cache_file_name(), 'wb')
859 by Martin Pool - add HashCache.write and a simple test for it	198	try:
862 by Martin Pool - code to re-read hashcache from file	199	print >>outf, CACHE_HEADER,
859 by Martin Pool - add HashCache.write and a simple test for it	200
860 by Martin Pool - refactor hashcache to use just one dictionary	201	for path, c in self._cache.iteritems():
859 by Martin Pool - add HashCache.write and a simple test for it	202	assert '//' not in path, path
	203	outf.write(path.encode('utf-8'))
	204	outf.write('// ')
860 by Martin Pool - refactor hashcache to use just one dictionary	205	print >>outf, c[0], # hex sha1
	206	for fld in c[1]:
862 by Martin Pool - code to re-read hashcache from file	207	print >>outf, "%d" % fld,
859 by Martin Pool - add HashCache.write and a simple test for it	208	print >>outf
	209
	210	outf.commit()
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	211	self.needs_write = False
859 by Martin Pool - add HashCache.write and a simple test for it	212	finally:
	213	if not outf.closed:
	214	outf.abort()
	215
862 by Martin Pool - code to re-read hashcache from file	216
862 by Martin Pool - code to re-read hashcache from file	217
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	218	def read(self):
862 by Martin Pool - code to re-read hashcache from file	219	"""Reinstate cache from file.
	220
	221	Overwrites existing cache.
	222
	223	If the cache file has the wrong version marker, this just clears
	224	the cache."""
	225	self._cache = {}
	226
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	227	fn = self.cache_file_name()
	228	try:
948 by Martin Pool - more buffering when reading/writing hashcache	229	inf = file(fn, 'rb', buffering=65000)
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	230	except IOError, e:
	231	mutter("failed to open %s: %s" % (fn, e))
1214 by Martin Pool - hashcache should be written out if it can't be read	232	# better write it now so it is valid
	233	self.needs_write = True
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	234	return
	235
	236
862 by Martin Pool - code to re-read hashcache from file	237	hdr = inf.readline()
	238	if hdr != CACHE_HEADER:
	239	mutter('cache header marker not found at top of %s; discarding cache'
878 by Martin Pool - fix typo	240	% fn)
1214 by Martin Pool - hashcache should be written out if it can't be read	241	self.needs_write = True
862 by Martin Pool - code to re-read hashcache from file	242	return
	243
	244	for l in inf:
	245	pos = l.index('// ')
	246	path = l[:pos].decode('utf-8')
	247	if path in self._cache:
	248	warning('duplicated path %r in cache' % path)
	249	continue
	250
	251	pos += 3
	252	fields = l[pos:].split(' ')
1092.2.6 by Robert Collins symlink support updated to work	253	if len(fields) != 7:
862 by Martin Pool - code to re-read hashcache from file	254	warning("bad line in hashcache: %r" % l)
	255	continue
	256
	257	sha1 = fields[0]
	258	if len(sha1) != 40:
	259	warning("bad sha1 in hashcache: %r" % sha1)
	260	continue
	261
	262	fp = tuple(map(long, fields[1:]))
	263
	264	self._cache[path] = (sha1, fp)
	265
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	266	self.needs_write = False
	267
	268
862 by Martin Pool - code to re-read hashcache from file	269
862 by Martin Pool - code to re-read hashcache from file	270