~bzr-pqm/bzr/bzr.dev : contents of bzrlib/hashcache.py at revision 953

~bzr-pqm/bzr/bzr.dev : (revision 953)

# (C) 2005 Canonical Ltd

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# TODO: Up-front, stat all files in order and remove those which are deleted or 
# out-of-date.  Don't actually re-read them until they're needed.  That ought 
# to bring all the inodes into core so that future stats to them are fast, and 
# it preserves the nice property that any caller will always get up-to-date
# data except in unavoidable cases.

# TODO: Perhaps return more details on the file to avoid statting it
# again: nonexistent, file type, size, etc



CACHE_HEADER = "### bzr hashcache v5\n"

import os, stat, time

from bzrlib.osutils import sha_file
from bzrlib.trace import mutter, warning



def _fingerprint(abspath):
    try:
        fs = os.lstat(abspath)
    except OSError:
        # might be missing, etc
        return None

    if stat.S_ISDIR(fs.st_mode):
        return None

    # we discard any high precision because it's not reliable; perhaps we
    # could do better on some systems?
    return (fs.st_size, long(fs.st_mtime),
            long(fs.st_ctime), fs.st_ino, fs.st_dev)


class HashCache(object):
    """Cache for looking up file SHA-1.

    Files are considered to match the cached value if the fingerprint
    of the file has not changed.  This includes its mtime, ctime,
    device number, inode number, and size.  This should catch
    modifications or replacement of the file by a new one.

    This may not catch modifications that do not change the file's
    size and that occur within the resolution window of the
    timestamps.  To handle this we specifically do not cache files
    which have changed since the start of the present second, since
    they could undetectably change again.

    This scheme may fail if the machine's clock steps backwards.
    Don't do that.

    This does not canonicalize the paths passed in; that should be
    done by the caller.

    _cache
        Indexed by path, points to a two-tuple of the SHA-1 of the file.
        and its fingerprint.

    stat_count
        number of times files have been statted

    hit_count
        number of times files have been retrieved from the cache, avoiding a
        re-read
        
    miss_count
        number of misses (times files have been completely re-read)
    """
    needs_write = False

    def __init__(self, basedir):
        self.basedir = basedir
        self.hit_count = 0
        self.miss_count = 0
        self.stat_count = 0
        self.danger_count = 0
        self.gone_count = 0
        self.removed_count = 0
        self._cache = {}


    def cache_file_name(self):
        return os.sep.join([self.basedir, '.bzr', 'stat-cache'])




    def clear(self):
        """Discard all cached information.

        This does not reset the counters."""
        if self._cache:
            self.needs_write = True
            self._cache = {}


    def refresh_all(self):
        prep = [(ce[1][3], path) for (path, ce) in self._cache.iteritems()]
        prep.sort()
        
        for inum, path in prep:
            # we don't really need to re-hash them; we just need to check 
            # if they're up to date
            self.get_sha1(path)


    def get_sha1(self, path):
        """Return the sha1 of a file.
        """
        abspath = os.sep.join([self.basedir, path])
        fp = _fingerprint(abspath)

        c = self._cache.get(path)
        if c:
            cache_sha1, cache_fp = c
        else:
            cache_sha1, cache_fp = None, None

        self.stat_count += 1

        if not fp:
            # not a regular file
            if path in self._cache:
                self.removed_count += 1
                self.needs_write = True
                del self._cache[path]
            return None
        elif cache_fp and (cache_fp == fp):
            self.hit_count += 1
            return cache_sha1
        else:
            self.miss_count += 1
            digest = sha_file(file(abspath, 'rb', buffering=65000))

            now = int(time.time())
            if fp[1] >= now or fp[2] >= now:
                # changed too recently; can't be cached.  we can
                # return the result and it could possibly be cached
                # next time.
                self.danger_count += 1 
                if cache_fp:
                    self.removed_count += 1
                    self.needs_write = True
                    del self._cache[path]
            elif (fp != cache_fp) or (digest != cache_sha1):
#                 mutter("update entry for %s" % path)
#                 mutter("  %r" % (fp,))
#                 mutter("  %r" % (cache_fp,))
                self.needs_write = True
                self._cache[path] = (digest, fp)
            else:
                # huh?
                assert 0
            
            return digest
            



    def write(self):
        """Write contents of cache to file."""
        from atomicfile import AtomicFile

        outf = AtomicFile(self.cache_file_name(), 'wb')
        try:
            print >>outf, CACHE_HEADER,

            for path, c  in self._cache.iteritems():
                assert '//' not in path, path
                outf.write(path.encode('utf-8'))
                outf.write('// ')
                print >>outf, c[0],     # hex sha1
                for fld in c[1]:
                    print >>outf, "%d" % fld,
                print >>outf

            outf.commit()
            self.needs_write = False
        finally:
            if not outf.closed:
                outf.abort()
        


    def read(self):
        """Reinstate cache from file.

        Overwrites existing cache.

        If the cache file has the wrong version marker, this just clears 
        the cache."""
        self._cache = {}

        fn = self.cache_file_name()
        try:
            inf = file(fn, 'rb', buffering=65000)
        except IOError, e:
            mutter("failed to open %s: %s" % (fn, e))
            return


        hdr = inf.readline()
        if hdr != CACHE_HEADER:
            mutter('cache header marker not found at top of %s; discarding cache'
                   % fn)
            return

        for l in inf:
            pos = l.index('// ')
            path = l[:pos].decode('utf-8')
            if path in self._cache:
                warning('duplicated path %r in cache' % path)
                continue

            pos += 3
            fields = l[pos:].split(' ')
            if len(fields) != 6:
                warning("bad line in hashcache: %r" % l)
                continue

            sha1 = fields[0]
            if len(sha1) != 40:
                warning("bad sha1 in hashcache: %r" % sha1)
                continue

            fp = tuple(map(long, fields[1:]))

            self._cache[path] = (sha1, fp)

        self.needs_write = False
           


        

846 by Martin Pool - start adding refactored/simplified hash cache	1	# (C) 2005 Canonical Ltd
	2
	3	# This program is free software; you can redistribute it and/or modify
	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
	7
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
	15	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	16
953 by Martin Pool - refactor imports and stats for hashcache	17	# TODO: Up-front, stat all files in order and remove those which are deleted or
	18	# out-of-date. Don't actually re-read them until they're needed. That ought
	19	# to bring all the inodes into core so that future stats to them are fast, and
	20	# it preserves the nice property that any caller will always get up-to-date
	21	# data except in unavoidable cases.
864 by Martin Pool doc	22
	23	# TODO: Perhaps return more details on the file to avoid statting it
	24	# again: nonexistent, file type, size, etc
	25
	26
	27
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	28	CACHE_HEADER = "### bzr hashcache v5\n"
859 by Martin Pool - add HashCache.write and a simple test for it	29
953 by Martin Pool - refactor imports and stats for hashcache	30	import os, stat, time
	31
	32	from bzrlib.osutils import sha_file
	33	from bzrlib.trace import mutter, warning
	34
	35
859 by Martin Pool - add HashCache.write and a simple test for it	36
846 by Martin Pool - start adding refactored/simplified hash cache	37	def _fingerprint(abspath):
	38	try:
	39	fs = os.lstat(abspath)
	40	except OSError:
	41	# might be missing, etc
	42	return None
	43
	44	if stat.S_ISDIR(fs.st_mode):
	45	return None
	46
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	47	# we discard any high precision because it's not reliable; perhaps we
	48	# could do better on some systems?
	49	return (fs.st_size, long(fs.st_mtime),
	50	long(fs.st_ctime), fs.st_ino, fs.st_dev)
846 by Martin Pool - start adding refactored/simplified hash cache	51
	52
	53	class HashCache(object):
	54	"""Cache for looking up file SHA-1.
	55
	56	Files are considered to match the cached value if the fingerprint
	57	of the file has not changed. This includes its mtime, ctime,
	58	device number, inode number, and size. This should catch
	59	modifications or replacement of the file by a new one.
	60
	61	This may not catch modifications that do not change the file's
	62	size and that occur within the resolution window of the
	63	timestamps. To handle this we specifically do not cache files
	64	which have changed since the start of the present second, since
	65	they could undetectably change again.
	66
	67	This scheme may fail if the machine's clock steps backwards.
	68	Don't do that.
	69
	70	This does not canonicalize the paths passed in; that should be
	71	done by the caller.
	72
860 by Martin Pool - refactor hashcache to use just one dictionary	73	_cache
	74	Indexed by path, points to a two-tuple of the SHA-1 of the file.
	75	and its fingerprint.
846 by Martin Pool - start adding refactored/simplified hash cache	76
	77	stat_count
	78	number of times files have been statted
	79
	80	hit_count
	81	number of times files have been retrieved from the cache, avoiding a
	82	re-read
	83
	84	miss_count
	85	number of misses (times files have been completely re-read)
	86	"""
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	87	needs_write = False
	88
846 by Martin Pool - start adding refactored/simplified hash cache	89	def __init__(self, basedir):
	90	self.basedir = basedir
	91	self.hit_count = 0
	92	self.miss_count = 0
	93	self.stat_count = 0
	94	self.danger_count = 0
953 by Martin Pool - refactor imports and stats for hashcache	95	self.gone_count = 0
	96	self.removed_count = 0
860 by Martin Pool - refactor hashcache to use just one dictionary	97	self._cache = {}
846 by Martin Pool - start adding refactored/simplified hash cache	98
	99
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	100	def cache_file_name(self):
953 by Martin Pool - refactor imports and stats for hashcache	101	return os.sep.join([self.basedir, '.bzr', 'stat-cache'])
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	102
	103
	104
	105
846 by Martin Pool - start adding refactored/simplified hash cache	106	def clear(self):
860 by Martin Pool - refactor hashcache to use just one dictionary	107	"""Discard all cached information.
	108
	109	This does not reset the counters."""
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	110	if self._cache:
	111	self.needs_write = True
	112	self._cache = {}
846 by Martin Pool - start adding refactored/simplified hash cache	113
	114
953 by Martin Pool - refactor imports and stats for hashcache	115	def refresh_all(self):
	116	prep = [(ce[1][3], path) for (path, ce) in self._cache.iteritems()]
	117	prep.sort()
	118
	119	for inum, path in prep:
	120	# we don't really need to re-hash them; we just need to check
	121	# if they're up to date
	122	self.get_sha1(path)
	123
	124
846 by Martin Pool - start adding refactored/simplified hash cache	125	def get_sha1(self, path):
953 by Martin Pool - refactor imports and stats for hashcache	126	"""Return the sha1 of a file.
846 by Martin Pool - start adding refactored/simplified hash cache	127	"""
953 by Martin Pool - refactor imports and stats for hashcache	128	abspath = os.sep.join([self.basedir, path])
846 by Martin Pool - start adding refactored/simplified hash cache	129	fp = _fingerprint(abspath)
953 by Martin Pool - refactor imports and stats for hashcache	130
860 by Martin Pool - refactor hashcache to use just one dictionary	131	c = self._cache.get(path)
	132	if c:
	133	cache_sha1, cache_fp = c
	134	else:
	135	cache_sha1, cache_fp = None, None
846 by Martin Pool - start adding refactored/simplified hash cache	136
	137	self.stat_count += 1
	138
	139	if not fp:
	140	# not a regular file
953 by Martin Pool - refactor imports and stats for hashcache	141	if path in self._cache:
	142	self.removed_count += 1
	143	self.needs_write = True
	144	del self._cache[path]
846 by Martin Pool - start adding refactored/simplified hash cache	145	return None
	146	elif cache_fp and (cache_fp == fp):
	147	self.hit_count += 1
860 by Martin Pool - refactor hashcache to use just one dictionary	148	return cache_sha1
846 by Martin Pool - start adding refactored/simplified hash cache	149	else:
	150	self.miss_count += 1
948 by Martin Pool - more buffering when reading/writing hashcache	151	digest = sha_file(file(abspath, 'rb', buffering=65000))
846 by Martin Pool - start adding refactored/simplified hash cache	152
	153	now = int(time.time())
	154	if fp[1] >= now or fp[2] >= now:
	155	# changed too recently; can't be cached. we can
	156	# return the result and it could possibly be cached
	157	# next time.
	158	self.danger_count += 1
	159	if cache_fp:
953 by Martin Pool - refactor imports and stats for hashcache	160	self.removed_count += 1
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	161	self.needs_write = True
860 by Martin Pool - refactor hashcache to use just one dictionary	162	del self._cache[path]
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	163	elif (fp != cache_fp) or (digest != cache_sha1):
953 by Martin Pool - refactor imports and stats for hashcache	164	# mutter("update entry for %s" % path)
	165	# mutter(" %r" % (fp,))
	166	# mutter(" %r" % (cache_fp,))
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	167	self.needs_write = True
860 by Martin Pool - refactor hashcache to use just one dictionary	168	self._cache[path] = (digest, fp)
953 by Martin Pool - refactor imports and stats for hashcache	169	else:
	170	# huh?
	171	assert 0
	172
846 by Martin Pool - start adding refactored/simplified hash cache	173	return digest
953 by Martin Pool - refactor imports and stats for hashcache	174
846 by Martin Pool - start adding refactored/simplified hash cache	175
859 by Martin Pool - add HashCache.write and a simple test for it	176
	177
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	178	def write(self):
859 by Martin Pool - add HashCache.write and a simple test for it	179	"""Write contents of cache to file."""
	180	from atomicfile import AtomicFile
	181
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	182	outf = AtomicFile(self.cache_file_name(), 'wb')
859 by Martin Pool - add HashCache.write and a simple test for it	183	try:
862 by Martin Pool - code to re-read hashcache from file	184	print >>outf, CACHE_HEADER,
859 by Martin Pool - add HashCache.write and a simple test for it	185
860 by Martin Pool - refactor hashcache to use just one dictionary	186	for path, c in self._cache.iteritems():
859 by Martin Pool - add HashCache.write and a simple test for it	187	assert '//' not in path, path
	188	outf.write(path.encode('utf-8'))
	189	outf.write('// ')
860 by Martin Pool - refactor hashcache to use just one dictionary	190	print >>outf, c[0], # hex sha1
	191	for fld in c[1]:
862 by Martin Pool - code to re-read hashcache from file	192	print >>outf, "%d" % fld,
859 by Martin Pool - add HashCache.write and a simple test for it	193	print >>outf
	194
	195	outf.commit()
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	196	self.needs_write = False
859 by Martin Pool - add HashCache.write and a simple test for it	197	finally:
	198	if not outf.closed:
	199	outf.abort()
	200
862 by Martin Pool - code to re-read hashcache from file	201
862 by Martin Pool - code to re-read hashcache from file	202
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	203	def read(self):
862 by Martin Pool - code to re-read hashcache from file	204	"""Reinstate cache from file.
	205
	206	Overwrites existing cache.
	207
	208	If the cache file has the wrong version marker, this just clears
	209	the cache."""
	210	self._cache = {}
	211
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	212	fn = self.cache_file_name()
	213	try:
948 by Martin Pool - more buffering when reading/writing hashcache	214	inf = file(fn, 'rb', buffering=65000)
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	215	except IOError, e:
	216	mutter("failed to open %s: %s" % (fn, e))
	217	return
	218
	219
862 by Martin Pool - code to re-read hashcache from file	220	hdr = inf.readline()
	221	if hdr != CACHE_HEADER:
	222	mutter('cache header marker not found at top of %s; discarding cache'
878 by Martin Pool - fix typo	223	% fn)
862 by Martin Pool - code to re-read hashcache from file	224	return
	225
	226	for l in inf:
	227	pos = l.index('// ')
	228	path = l[:pos].decode('utf-8')
	229	if path in self._cache:
	230	warning('duplicated path %r in cache' % path)
	231	continue
	232
	233	pos += 3
	234	fields = l[pos:].split(' ')
	235	if len(fields) != 6:
	236	warning("bad line in hashcache: %r" % l)
	237	continue
	238
	239	sha1 = fields[0]
	240	if len(sha1) != 40:
	241	warning("bad sha1 in hashcache: %r" % sha1)
	242	continue
	243
	244	fp = tuple(map(long, fields[1:]))
	245
	246	self._cache[path] = (sha1, fp)
	247
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	248	self.needs_write = False
	249
	250
862 by Martin Pool - code to re-read hashcache from file	251
862 by Martin Pool - code to re-read hashcache from file	252