# (C) 2005 Canonical Ltd

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


# TODO: Perhaps have a way to stat all the files in inode order, and
# then remember that they're all fresh for the lifetime of the object?

# TODO: Keep track of whether there are in-memory updates that need to
# be flushed.

# TODO: Perhaps return more details on the file to avoid statting it
# again: nonexistent, file type, size, etc

# Version marker written as the first line of the on-disk cache file;
# read() compares against it and discards caches written by an
# incompatible version.  Note the embedded trailing newline.
CACHE_HEADER = "### bzr hashcache v5\n"

def _fingerprint(abspath):
    """Return a stat fingerprint for the file at abspath.

    The fingerprint is a tuple (size, mtime, ctime, inode, device),
    with the timestamps squashed to whole seconds.

    Returns None if the path cannot be lstat'd (missing, etc.) or if
    it is a directory.
    """
    import os, stat

    try:
        fs = os.lstat(abspath)
    except OSError:
        # might be missing, etc
        return None

    if stat.S_ISDIR(fs.st_mode):
        return None

    # we discard any high precision because it's not reliable; perhaps we
    # could do better on some systems?
    # int() rather than long(): in Python 2 the two compare equal and
    # int auto-promotes to long when needed (PEP 237), so behaviour is
    # unchanged and the function stays forward-portable.
    return (fs.st_size, int(fs.st_mtime),
            int(fs.st_ctime), fs.st_ino, fs.st_dev)

class HashCache(object):
53
    """Cache for looking up file SHA-1.
54
55
    Files are considered to match the cached value if the fingerprint
56
    of the file has not changed.  This includes its mtime, ctime,
57
    device number, inode number, and size.  This should catch
58
    modifications or replacement of the file by a new one.
59
60
    This may not catch modifications that do not change the file's
61
    size and that occur within the resolution window of the
62
    timestamps.  To handle this we specifically do not cache files
63
    which have changed since the start of the present second, since
64
    they could undetectably change again.
65
66
    This scheme may fail if the machine's clock steps backwards.
67
    Don't do that.
68
69
    This does not canonicalize the paths passed in; that should be
70
    done by the caller.
71
860 by Martin Pool
- refactor hashcache to use just one dictionary
72
    _cache
73
        Indexed by path, points to a two-tuple of the SHA-1 of the file.
74
        and its fingerprint.
846 by Martin Pool
- start adding refactored/simplified hash cache
75
76
    stat_count
77
        number of times files have been statted
78
79
    hit_count
80
        number of times files have been retrieved from the cache, avoiding a
81
        re-read
82
        
83
    miss_count
84
        number of misses (times files have been completely re-read)
85
    """
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
86
    needs_write = False
87
846 by Martin Pool
- start adding refactored/simplified hash cache
88
    def __init__(self, basedir):
89
        self.basedir = basedir
90
        self.hit_count = 0
91
        self.miss_count = 0
92
        self.stat_count = 0
93
        self.danger_count = 0
860 by Martin Pool
- refactor hashcache to use just one dictionary
94
        self._cache = {}
846 by Martin Pool
- start adding refactored/simplified hash cache
95
96
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
97
98
    def cache_file_name(self):
99
        import os.path
100
        return os.path.join(self.basedir, '.bzr', 'stat-cache')
101
102
103
104
846 by Martin Pool
- start adding refactored/simplified hash cache
105
    def clear(self):
860 by Martin Pool
- refactor hashcache to use just one dictionary
106
        """Discard all cached information.
107
108
        This does not reset the counters."""
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
109
        if self._cache:
110
            self.needs_write = True
111
            self._cache = {}
846 by Martin Pool
- start adding refactored/simplified hash cache
112
113
114
    def get_sha1(self, path):
115
        """Return the hex SHA-1 of the contents of the file at path.
116
117
        XXX: If the file does not exist or is not a plain file???
118
        """
119
120
        import os, time
121
        from bzrlib.osutils import sha_file
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
122
        from bzrlib.trace import mutter
846 by Martin Pool
- start adding refactored/simplified hash cache
123
        
124
        abspath = os.path.join(self.basedir, path)
125
        fp = _fingerprint(abspath)
860 by Martin Pool
- refactor hashcache to use just one dictionary
126
        c = self._cache.get(path)
127
        if c:
128
            cache_sha1, cache_fp = c
129
        else:
130
            cache_sha1, cache_fp = None, None
846 by Martin Pool
- start adding refactored/simplified hash cache
131
132
        self.stat_count += 1
133
134
        if not fp:
135
            # not a regular file
136
            return None
137
        elif cache_fp and (cache_fp == fp):
138
            self.hit_count += 1
860 by Martin Pool
- refactor hashcache to use just one dictionary
139
            return cache_sha1
846 by Martin Pool
- start adding refactored/simplified hash cache
140
        else:
141
            self.miss_count += 1
142
            digest = sha_file(file(abspath, 'rb'))
143
144
            now = int(time.time())
145
            if fp[1] >= now or fp[2] >= now:
146
                # changed too recently; can't be cached.  we can
147
                # return the result and it could possibly be cached
148
                # next time.
149
                self.danger_count += 1 
150
                if cache_fp:
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
151
                    mutter("remove outdated entry for %s" % path)
152
                    self.needs_write = True
860 by Martin Pool
- refactor hashcache to use just one dictionary
153
                    del self._cache[path]
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
154
            elif (fp != cache_fp) or (digest != cache_sha1):
155
                mutter("update entry for %s" % path)
156
                mutter("  %r" % (fp,))
157
                mutter("  %r" % (cache_fp,))
158
                self.needs_write = True
860 by Martin Pool
- refactor hashcache to use just one dictionary
159
                self._cache[path] = (digest, fp)
846 by Martin Pool
- start adding refactored/simplified hash cache
160
161
            return digest
162
859 by Martin Pool
- add HashCache.write and a simple test for it
163
164
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
165
    def write(self):
859 by Martin Pool
- add HashCache.write and a simple test for it
166
        """Write contents of cache to file."""
167
        from atomicfile import AtomicFile
168
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
169
        outf = AtomicFile(self.cache_file_name(), 'wb')
859 by Martin Pool
- add HashCache.write and a simple test for it
170
        try:
862 by Martin Pool
- code to re-read hashcache from file
171
            print >>outf, CACHE_HEADER,
859 by Martin Pool
- add HashCache.write and a simple test for it
172
860 by Martin Pool
- refactor hashcache to use just one dictionary
173
            for path, c  in self._cache.iteritems():
859 by Martin Pool
- add HashCache.write and a simple test for it
174
                assert '//' not in path, path
175
                outf.write(path.encode('utf-8'))
176
                outf.write('// ')
860 by Martin Pool
- refactor hashcache to use just one dictionary
177
                print >>outf, c[0],     # hex sha1
178
                for fld in c[1]:
862 by Martin Pool
- code to re-read hashcache from file
179
                    print >>outf, "%d" % fld,
859 by Martin Pool
- add HashCache.write and a simple test for it
180
                print >>outf
181
182
            outf.commit()
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
183
            self.needs_write = False
859 by Martin Pool
- add HashCache.write and a simple test for it
184
        finally:
185
            if not outf.closed:
186
                outf.abort()
187
        
862 by Martin Pool
- code to re-read hashcache from file
188
189
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
190
    def read(self):
862 by Martin Pool
- code to re-read hashcache from file
191
        """Reinstate cache from file.
192
193
        Overwrites existing cache.
194
195
        If the cache file has the wrong version marker, this just clears 
196
        the cache."""
197
        from bzrlib.trace import mutter, warning
198
199
        self._cache = {}
200
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
201
        fn = self.cache_file_name()
202
        try:
203
            inf = file(fn, 'rb')
204
        except IOError, e:
205
            mutter("failed to open %s: %s" % (fn, e))
206
            return
207
208
862 by Martin Pool
- code to re-read hashcache from file
209
        hdr = inf.readline()
210
        if hdr != CACHE_HEADER:
211
            mutter('cache header marker not found at top of %s; discarding cache'
878 by Martin Pool
- fix typo
212
                   % fn)
862 by Martin Pool
- code to re-read hashcache from file
213
            return
214
215
        for l in inf:
216
            pos = l.index('// ')
217
            path = l[:pos].decode('utf-8')
218
            if path in self._cache:
219
                warning('duplicated path %r in cache' % path)
220
                continue
221
222
            pos += 3
223
            fields = l[pos:].split(' ')
224
            if len(fields) != 6:
225
                warning("bad line in hashcache: %r" % l)
226
                continue
227
228
            sha1 = fields[0]
229
            if len(sha1) != 40:
230
                warning("bad sha1 in hashcache: %r" % sha1)
231
                continue
232
233
            fp = tuple(map(long, fields[1:]))
234
235
            self._cache[path] = (sha1, fp)
236
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
237
        self.needs_write = False
238
           
239
862 by Martin Pool
- code to re-read hashcache from file
240
241