~bzr-pqm/bzr/bzr.dev : contents of bzrlib/hashcache.py at revision 974.1.50

~bzr-pqm/bzr/bzr.dev : (revision 974.1.50)

# (C) 2005 Canonical Ltd

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# TODO: Up-front, stat all files in order and remove those which are deleted or 
# out-of-date.  Don't actually re-read them until they're needed.  That ought 
# to bring all the inodes into core so that future stats to them are fast, and 
# it preserves the nice property that any caller will always get up-to-date
# data except in unavoidable cases.

# TODO: Perhaps return more details on the file to avoid statting it
# again: nonexistent, file type, size, etc



# Version marker written as the first line of the cache file by
# HashCache.write(); HashCache.read() discards any cache file whose first
# line is not exactly this string.
CACHE_HEADER = "### bzr hashcache v5\n"

import os, stat, time

from bzrlib.osutils import sha_file
from bzrlib.trace import mutter, warning



def _fingerprint(abspath):
    """Return a stat-based fingerprint for abspath, or None.

    None is returned when the path cannot be lstat'ed (for example because
    it is missing) or when it is a directory.  Otherwise the result is the
    tuple (size, mtime, ctime, inode, device), with both timestamps
    converted to whole seconds.
    """
    try:
        st = os.lstat(abspath)
    except OSError:
        # the path may have been deleted, be inaccessible, etc
        return None

    if not stat.S_ISDIR(st.st_mode):
        # sub-second precision is thrown away because it is not reliable;
        # perhaps some systems could do better?
        mtime = long(st.st_mtime)
        ctime = long(st.st_ctime)
        return (st.st_size, mtime, ctime, st.st_ino, st.st_dev)

    return None


class HashCache(object):
    """Cache for looking up file SHA-1.

    Files are considered to match the cached value if the fingerprint
    of the file has not changed.  This includes its mtime, ctime,
    device number, inode number, and size.  This should catch
    modifications or replacement of the file by a new one.

    This may not catch modifications that do not change the file's
    size and that occur within the resolution window of the
    timestamps.  To handle this we specifically do not cache files
    which have changed since the start of the present second, since
    they could undetectably change again.

    This scheme may fail if the machine's clock steps backwards.
    Don't do that.

    This does not canonicalize the paths passed in; that should be
    done by the caller.

    _cache
        Indexed by path, points to a two-tuple of the SHA-1 of the file.
        and its fingerprint.

    stat_count
        number of times files have been statted

    hit_count
        number of times files have been retrieved from the cache, avoiding a
        re-read
        
    miss_count
        number of misses (times files have been completely re-read)
    """
    needs_write = False

    def __init__(self, basedir):
        self.basedir = basedir
        self.hit_count = 0
        self.miss_count = 0
        self.stat_count = 0
        self.danger_count = 0
        self.removed_count = 0
        self.update_count = 0
        self._cache = {}


    def cache_file_name(self):
        return os.sep.join([self.basedir, '.bzr', 'stat-cache'])




    def clear(self):
        """Discard all cached information.

        This does not reset the counters."""
        if self._cache:
            self.needs_write = True
            self._cache = {}


    def scan(self):
        """Scan all files and remove entries where the cache entry is obsolete.
        
        Obsolete entries are those where the file has been modified or deleted
        since the entry was inserted.        
        """
        prep = [(ce[1][3], path, ce) for (path, ce) in self._cache.iteritems()]
        prep.sort()
        
        for inum, path, cache_entry in prep:
            abspath = os.sep.join([self.basedir, path])
            fp = _fingerprint(abspath)
            self.stat_count += 1
            
            cache_fp = cache_entry[1]
    
            if (not fp) or (cache_fp != fp):
                # not here or not a regular file anymore
                self.removed_count += 1
                self.needs_write = True
                del self._cache[path]



    def get_sha1(self, path):
        """Return the sha1 of a file.
        """
        abspath = os.sep.join([self.basedir, path])
        self.stat_count += 1
        file_fp = _fingerprint(abspath)
        
        if not file_fp:
            # not a regular file or not existing
            if path in self._cache:
                self.removed_count += 1
                self.needs_write = True
                del self._cache[path]
            return None        

        if path in self._cache:
            cache_sha1, cache_fp = self._cache[path]
        else:
            cache_sha1, cache_fp = None, None

        if cache_fp == file_fp:
            self.hit_count += 1
            return cache_sha1
        
        self.miss_count += 1
        digest = sha_file(file(abspath, 'rb', buffering=65000))

        now = int(time.time())
        if file_fp[1] >= now or file_fp[2] >= now:
            # changed too recently; can't be cached.  we can
            # return the result and it could possibly be cached
            # next time.
            self.danger_count += 1 
            if cache_fp:
                self.removed_count += 1
                self.needs_write = True
                del self._cache[path]
        else:
            self.update_count += 1
            self.needs_write = True
            self._cache[path] = (digest, file_fp)
        
        return digest
        



    def write(self):
        """Write contents of cache to file."""
        from atomicfile import AtomicFile

        outf = AtomicFile(self.cache_file_name(), 'wb')
        try:
            print >>outf, CACHE_HEADER,

            for path, c  in self._cache.iteritems():
                assert '//' not in path, path
                outf.write(path.encode('utf-8'))
                outf.write('// ')
                print >>outf, c[0],     # hex sha1
                for fld in c[1]:
                    print >>outf, "%d" % fld,
                print >>outf

            outf.commit()
            self.needs_write = False
        finally:
            if not outf.closed:
                outf.abort()
        


    def read(self):
        """Reinstate cache from file.

        Overwrites existing cache.

        If the cache file has the wrong version marker, this just clears 
        the cache."""
        self._cache = {}

        fn = self.cache_file_name()
        try:
            inf = file(fn, 'rb', buffering=65000)
        except IOError, e:
            mutter("failed to open %s: %s" % (fn, e))
            return


        hdr = inf.readline()
        if hdr != CACHE_HEADER:
            mutter('cache header marker not found at top of %s; discarding cache'
                   % fn)
            return

        for l in inf:
            pos = l.index('// ')
            path = l[:pos].decode('utf-8')
            if path in self._cache:
                warning('duplicated path %r in cache' % path)
                continue

            pos += 3
            fields = l[pos:].split(' ')
            if len(fields) != 6:
                warning("bad line in hashcache: %r" % l)
                continue

            sha1 = fields[0]
            if len(sha1) != 40:
                warning("bad sha1 in hashcache: %r" % sha1)
                continue

            fp = tuple(map(long, fields[1:]))

            self._cache[path] = (sha1, fp)

        self.needs_write = False
           


        

846 by Martin Pool - start adding refactored/simplified hash cache	1	# (C) 2005 Canonical Ltd
	2
	3	# This program is free software; you can redistribute it and/or modify
	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
	7
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
	15	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	16
953 by Martin Pool - refactor imports and stats for hashcache	17	# TODO: Up-front, stat all files in order and remove those which are deleted or
	18	# out-of-date. Don't actually re-read them until they're needed. That ought
	19	# to bring all the inodes into core so that future stats to them are fast, and
	20	# it preserves the nice property that any caller will always get up-to-date
	21	# data except in unavoidable cases.
864 by Martin Pool doc	22
	23	# TODO: Perhaps return more details on the file to avoid statting it
	24	# again: nonexistent, file type, size, etc
	25
	26
	27
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	28	CACHE_HEADER = "### bzr hashcache v5\n"
859 by Martin Pool - add HashCache.write and a simple test for it	29
953 by Martin Pool - refactor imports and stats for hashcache	30	import os, stat, time
	31
	32	from bzrlib.osutils import sha_file
	33	from bzrlib.trace import mutter, warning
	34
	35
859 by Martin Pool - add HashCache.write and a simple test for it	36
846 by Martin Pool - start adding refactored/simplified hash cache	37	def _fingerprint(abspath):
	38	try:
	39	fs = os.lstat(abspath)
	40	except OSError:
	41	# might be missing, etc
	42	return None
	43
	44	if stat.S_ISDIR(fs.st_mode):
	45	return None
	46
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	47	# we discard any high precision because it's not reliable; perhaps we
	48	# could do better on some systems?
	49	return (fs.st_size, long(fs.st_mtime),
	50	long(fs.st_ctime), fs.st_ino, fs.st_dev)
846 by Martin Pool - start adding refactored/simplified hash cache	51
	52
	53	class HashCache(object):
	54	"""Cache for looking up file SHA-1.
	55
	56	Files are considered to match the cached value if the fingerprint
	57	of the file has not changed. This includes its mtime, ctime,
	58	device number, inode number, and size. This should catch
	59	modifications or replacement of the file by a new one.
	60
	61	This may not catch modifications that do not change the file's
	62	size and that occur within the resolution window of the
	63	timestamps. To handle this we specifically do not cache files
	64	which have changed since the start of the present second, since
	65	they could undetectably change again.
	66
	67	This scheme may fail if the machine's clock steps backwards.
	68	Don't do that.
	69
	70	This does not canonicalize the paths passed in; that should be
	71	done by the caller.
	72
860 by Martin Pool - refactor hashcache to use just one dictionary	73	_cache
	74	Indexed by path, points to a two-tuple of the SHA-1 of the file.
	75	and its fingerprint.
846 by Martin Pool - start adding refactored/simplified hash cache	76
	77	stat_count
	78	number of times files have been statted
	79
	80	hit_count
	81	number of times files have been retrieved from the cache, avoiding a
	82	re-read
	83
	84	miss_count
	85	number of misses (times files have been completely re-read)
	86	"""
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	87	needs_write = False
	88
846 by Martin Pool - start adding refactored/simplified hash cache	89	def __init__(self, basedir):
	90	self.basedir = basedir
	91	self.hit_count = 0
	92	self.miss_count = 0
	93	self.stat_count = 0
	94	self.danger_count = 0
953 by Martin Pool - refactor imports and stats for hashcache	95	self.removed_count = 0
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	96	self.update_count = 0
860 by Martin Pool - refactor hashcache to use just one dictionary	97	self._cache = {}
846 by Martin Pool - start adding refactored/simplified hash cache	98
	99
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	100	def cache_file_name(self):
953 by Martin Pool - refactor imports and stats for hashcache	101	return os.sep.join([self.basedir, '.bzr', 'stat-cache'])
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	102
	103
	104
	105
846 by Martin Pool - start adding refactored/simplified hash cache	106	def clear(self):
860 by Martin Pool - refactor hashcache to use just one dictionary	107	"""Discard all cached information.
	108
	109	This does not reset the counters."""
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	110	if self._cache:
	111	self.needs_write = True
	112	self._cache = {}
846 by Martin Pool - start adding refactored/simplified hash cache	113
	114
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	115	def scan(self):
	116	"""Scan all files and remove entries where the cache entry is obsolete.
	117
	118	Obsolete entries are those where the file has been modified or deleted
	119	since the entry was inserted.
	120	"""
	121	prep = [(ce[1][3], path, ce) for (path, ce) in self._cache.iteritems()]
953 by Martin Pool - refactor imports and stats for hashcache	122	prep.sort()
	123
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	124	for inum, path, cache_entry in prep:
	125	abspath = os.sep.join([self.basedir, path])
	126	fp = _fingerprint(abspath)
	127	self.stat_count += 1
	128
	129	cache_fp = cache_entry[1]
	130
	131	if (not fp) or (cache_fp != fp):
	132	# not here or not a regular file anymore
	133	self.removed_count += 1
	134	self.needs_write = True
	135	del self._cache[path]
	136
953 by Martin Pool - refactor imports and stats for hashcache	137
	138
846 by Martin Pool - start adding refactored/simplified hash cache	139	def get_sha1(self, path):
953 by Martin Pool - refactor imports and stats for hashcache	140	"""Return the sha1 of a file.
846 by Martin Pool - start adding refactored/simplified hash cache	141	"""
953 by Martin Pool - refactor imports and stats for hashcache	142	abspath = os.sep.join([self.basedir, path])
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	143	self.stat_count += 1
	144	file_fp = _fingerprint(abspath)
	145
	146	if not file_fp:
	147	# not a regular file or not existing
	148	if path in self._cache:
	149	self.removed_count += 1
	150	self.needs_write = True
	151	del self._cache[path]
	152	return None
953 by Martin Pool - refactor imports and stats for hashcache	153
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	154	if path in self._cache:
	155	cache_sha1, cache_fp = self._cache[path]
860 by Martin Pool - refactor hashcache to use just one dictionary	156	else:
	157	cache_sha1, cache_fp = None, None
846 by Martin Pool - start adding refactored/simplified hash cache	158
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	159	if cache_fp == file_fp:
846 by Martin Pool - start adding refactored/simplified hash cache	160	self.hit_count += 1
860 by Martin Pool - refactor hashcache to use just one dictionary	161	return cache_sha1
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	162
	163	self.miss_count += 1
	164	digest = sha_file(file(abspath, 'rb', buffering=65000))
	165
	166	now = int(time.time())
	167	if file_fp[1] >= now or file_fp[2] >= now:
	168	# changed too recently; can't be cached. we can
	169	# return the result and it could possibly be cached
	170	# next time.
	171	self.danger_count += 1
	172	if cache_fp:
	173	self.removed_count += 1
	174	self.needs_write = True
	175	del self._cache[path]
846 by Martin Pool - start adding refactored/simplified hash cache	176	else:
954 by Martin Pool - separate out code that just scans the hash cache to find files that are possibly	177	self.update_count += 1
	178	self.needs_write = True
	179	self._cache[path] = (digest, file_fp)
	180
	181	return digest
	182
846 by Martin Pool - start adding refactored/simplified hash cache	183
859 by Martin Pool - add HashCache.write and a simple test for it	184
	185
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	186	def write(self):
859 by Martin Pool - add HashCache.write and a simple test for it	187	"""Write contents of cache to file."""
	188	from atomicfile import AtomicFile
	189
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	190	outf = AtomicFile(self.cache_file_name(), 'wb')
859 by Martin Pool - add HashCache.write and a simple test for it	191	try:
862 by Martin Pool - code to re-read hashcache from file	192	print >>outf, CACHE_HEADER,
859 by Martin Pool - add HashCache.write and a simple test for it	193
860 by Martin Pool - refactor hashcache to use just one dictionary	194	for path, c in self._cache.iteritems():
859 by Martin Pool - add HashCache.write and a simple test for it	195	assert '//' not in path, path
	196	outf.write(path.encode('utf-8'))
	197	outf.write('// ')
860 by Martin Pool - refactor hashcache to use just one dictionary	198	print >>outf, c[0], # hex sha1
	199	for fld in c[1]:
862 by Martin Pool - code to re-read hashcache from file	200	print >>outf, "%d" % fld,
859 by Martin Pool - add HashCache.write and a simple test for it	201	print >>outf
	202
	203	outf.commit()
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	204	self.needs_write = False
859 by Martin Pool - add HashCache.write and a simple test for it	205	finally:
	206	if not outf.closed:
	207	outf.abort()
	208
862 by Martin Pool - code to re-read hashcache from file	209
862 by Martin Pool - code to re-read hashcache from file	210
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	211	def read(self):
862 by Martin Pool - code to re-read hashcache from file	212	"""Reinstate cache from file.
	213
	214	Overwrites existing cache.
	215
	216	If the cache file has the wrong version marker, this just clears
	217	the cache."""
	218	self._cache = {}
	219
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	220	fn = self.cache_file_name()
	221	try:
948 by Martin Pool - more buffering when reading/writing hashcache	222	inf = file(fn, 'rb', buffering=65000)
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	223	except IOError, e:
	224	mutter("failed to open %s: %s" % (fn, e))
	225	return
	226
	227
862 by Martin Pool - code to re-read hashcache from file	228	hdr = inf.readline()
	229	if hdr != CACHE_HEADER:
	230	mutter('cache header marker not found at top of %s; discarding cache'
878 by Martin Pool - fix typo	231	% fn)
862 by Martin Pool - code to re-read hashcache from file	232	return
	233
	234	for l in inf:
	235	pos = l.index('// ')
	236	path = l[:pos].decode('utf-8')
	237	if path in self._cache:
	238	warning('duplicated path %r in cache' % path)
	239	continue
	240
	241	pos += 3
	242	fields = l[pos:].split(' ')
	243	if len(fields) != 6:
	244	warning("bad line in hashcache: %r" % l)
	245	continue
	246
	247	sha1 = fields[0]
	248	if len(sha1) != 40:
	249	warning("bad sha1 in hashcache: %r" % sha1)
	250	continue
	251
	252	fp = tuple(map(long, fields[1:]))
	253
	254	self._cache[path] = (sha1, fp)
	255
866 by Martin Pool - use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.	256	self.needs_write = False
	257
	258
862 by Martin Pool - code to re-read hashcache from file	259
862 by Martin Pool - code to re-read hashcache from file	260