362
by Martin Pool
- Import stat-cache code |
1 |
# (C) 2005 Canonical Ltd
|
2 |
||
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
7 |
||
8 |
# This program is distributed in the hope that it will be useful,
|
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
12 |
||
13 |
# You should have received a copy of the GNU General Public License
|
|
14 |
# along with this program; if not, write to the Free Software
|
|
15 |
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
16 |
||
17 |
import stat, os, sha, time |
|
18 |
from binascii import b2a_qp, a2b_qp |
|
19 |
||
20 |
from trace import mutter |
|
21 |
||
22 |
||
23 |
# file fingerprints are: (path, size, mtime, ctime, ino, dev).
|
|
24 |
#
|
|
25 |
# if this is the same for this file as in the previous revision, we
|
|
26 |
# assume the content is the same and the SHA-1 is the same.
|
|
27 |
||
28 |
# This is stored in a fingerprint file that also contains the file-id
|
|
29 |
# and the content SHA-1.
|
|
30 |
||
31 |
# Thus for any given file we can quickly get the SHA-1, either from
|
|
32 |
# the cache or if the cache is out of date.
|
|
33 |
||
34 |
# At the moment this is stored in a simple textfile; it might be nice
|
|
35 |
# to use a tdb instead.
|
|
36 |
||
37 |
||
38 |
# What we need:
|
|
39 |
||
40 |
# build a new cache from scratch
|
|
41 |
# load cache, incrementally update it
|
|
42 |
||
43 |
# TODO: Have a paranoid mode where we always compare the texts and
|
|
44 |
# always recalculate the digest, to trap modification without stat
|
|
45 |
# change and SHA collisions.
|
|
46 |
||
47 |
||
48 |
||
49 |
def fingerprint(path, abspath):
    """Return a stat-based fingerprint for the file at abspath.

    The fingerprint is the tuple (size, mtime, ctime, ino, dev).
    Returns None if the file is missing/unstatable or is a directory.
    The path argument is currently unused but kept for interface
    compatibility with callers.
    """
    try:
        st = os.lstat(abspath)
    except OSError:
        # gone missing, permission trouble, etc. -- treat as absent
        return None

    # directories carry no content fingerprint
    if stat.S_ISDIR(st.st_mode):
        return None

    return (st.st_size, st.st_mtime, st.st_ctime, st.st_ino, st.st_dev)
61 |
||
62 |
||
63 |
def write_cache(branch, entry_iter):
    """Atomically rewrite the branch's work cache.

    entry_iter yields tuples of the form
    (file_id, sha1, path, size, mtime, ctime, ino, dev).
    The path is quoted-printable encoded so spaces and unusual
    characters survive the space-separated line format.  The cache is
    written to 'work-cache.tmp' and renamed into place, so readers
    never observe a half-written cache file.
    """
    outf = branch.controlfile('work-cache.tmp', 'wt')
    try:
        for entry in entry_iter:
            outf.write(entry[0] + ' ' + entry[1] + ' ')
            outf.write(b2a_qp(entry[2], True))
            outf.write(' %d %d %d %d %d\n' % entry[3:])
    finally:
        # close the temp file even if entry_iter or a write blows up,
        # so we don't leak the handle; the rename below only happens
        # on the success path.
        outf.close()
    os.rename(branch.controlfilename('work-cache.tmp'),
              branch.controlfilename('work-cache'))
|
73 |
||
74 |
||
75 |
||
76 |
def load_cache(branch):
    """Load the branch's work cache into a dict keyed by file_id.

    Each value is a tuple (file_id, sha1, path, size, mtime, ctime,
    ino, dev) -- presumably matching what write_cache emits; verify.
    Returns an empty dict when no cache file exists yet.
    """
    cache = {}

    try:
        cachefile = branch.controlfile('work-cache', 'rt')
    except IOError:
        # no cache written yet -- start fresh
        return cache

    for l in cachefile:
        # line format: file_id sha1 qp-encoded-path then 5 stat fields
        f = l.split(' ')
        file_id = f[0]
        if file_id in cache:
            # NOTE(review): BzrError is not among this file's visible
            # imports, so reaching this raise would hit a NameError --
            # confirm where BzrError is meant to come from.
            raise BzrError("duplicated file_id in cache: {%s}" % file_id)
        # stat fields parsed with long() -- Python 2 era code
        cache[file_id] = (f[0], f[1], a2b_qp(f[2])) + tuple([long(x) for x in f[3:]])
    return cache
|
91 |
||
92 |
||
93 |
||
94 |
||
95 |
def _files_from_inventory(inv): |
|
96 |
for path, ie in inv.iter_entries(): |
|
97 |
if ie.kind != 'file': |
|
98 |
continue
|
|
99 |
yield ie.file_id, path |
|
100 |
||
101 |
||
102 |
def build_cache(branch):
    """Build a fresh work cache for branch from its working inventory.

    Unlike update_cache, this starts from an empty cache rather than
    loading the one on disk.  Returns the new cache dict; the original
    silently discarded the result of _update_cache_from_list, which
    was inconsistent with update_cache.
    """
    inv = branch.read_working_inventory()

    cache = {}
    return _update_cache_from_list(branch, cache, _files_from_inventory(inv))
|
107 |
||
108 |
||
109 |
||
110 |
def update_cache(branch, inv):
    """Bring the stored work cache up to date for the files in inv."""
    # TODO: It's supposed to be faster to stat the files in order by
    # inum.  We don't directly know the inum of the files of course,
    # but we do know where they were last sighted, so we can sort by
    # that.
    return _update_cache_from_list(branch, load_cache(branch),
                                   _files_from_inventory(inv))
|
117 |
||
118 |
||
119 |
||
120 |
def _update_cache_from_list(branch, cache, to_update):
    """Update the cache to have info on the named files.

    to_update is a sequence of (file_id, path) pairs.  Returns the
    updated cache dict; when anything changed, the cache is also
    rewritten to disk via write_cache.
    """
    hardcheck = dirty = 0
    for file_id, path in to_update:
        fap = branch.abspath(path)
        # BUG FIX: fingerprint's signature is (path, abspath) and it
        # stats its *second* argument; the original call passed
        # (fap, path), stat'ing the branch-relative path instead of
        # the absolute one.
        fp = fingerprint(path, fap)
        cacheentry = cache.get(file_id)

        if fp is None:
            # file is gone from the working tree; drop any stale entry
            if cacheentry:
                del cache[file_id]
                dirty += 1
            continue

        if cacheentry and (cacheentry[3:] == fp):
            continue                    # all stat fields unchanged

        hardcheck += 1

        # stat fields changed (or no entry yet): re-read and re-hash.
        # Close the handle explicitly rather than leaking it as the
        # original file(...).read() did.
        inf = open(fap, 'rb')
        try:
            dig = sha.new(inf.read()).hexdigest()
        finally:
            inf.close()

        if cacheentry is None or dig != cacheentry[1]:
            # no previous entry for this file, or the SHA changed:
            # update the cache
            cacheentry = (file_id, dig, path) + fp
            cache[file_id] = cacheentry
            dirty += 1

    mutter('work cache: read %d files, %d changed' % (hardcheck, dirty))

    if dirty:
        write_cache(branch, cache.itervalues())

    return cache