# (C) 2005 Canonical Ltd

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


# TODO: Perhaps have a way to stat all the files in inode order, and
# then remember that they're all fresh for the lifetime of the object?

# TODO: Keep track of whether there are in-memory updates that need to
# be flushed.

# TODO: Perhaps return more details on the file to avoid statting it
# again: nonexistent, file type, size, etc

# Version marker written as the first line of the on-disk cache file;
# read() compares against it and discards caches written by an
# incompatible version.  Note the embedded trailing newline.
CACHE_HEADER = "### bzr hashcache v5\n"

def _fingerprint(abspath):
    """Return a stat fingerprint for the file at abspath.

    The fingerprint is a tuple (size, mtime, ctime, inode, device),
    with the timestamps squashed to whole seconds.

    Returns None if the path cannot be lstat'd (missing, etc.) or if
    it is a directory.
    """
    import os, stat

    try:
        fs = os.lstat(abspath)
    except OSError:
        # might be missing, etc
        return None

    if stat.S_ISDIR(fs.st_mode):
        return None

    # we discard any high precision because it's not reliable; perhaps we
    # could do better on some systems?
    # int() rather than long(): in Python 2 the two compare equal and
    # int auto-promotes to long when needed (PEP 237), so behaviour is
    # unchanged and the function stays forward-portable.
    return (fs.st_size, int(fs.st_mtime),
            int(fs.st_ctime), fs.st_ino, fs.st_dev)

class HashCache(object):
53
    """Cache for looking up file SHA-1.
54
55
    Files are considered to match the cached value if the fingerprint
56
    of the file has not changed.  This includes its mtime, ctime,
57
    device number, inode number, and size.  This should catch
58
    modifications or replacement of the file by a new one.
59
60
    This may not catch modifications that do not change the file's
61
    size and that occur within the resolution window of the
62
    timestamps.  To handle this we specifically do not cache files
63
    which have changed since the start of the present second, since
64
    they could undetectably change again.
65
66
    This scheme may fail if the machine's clock steps backwards.
67
    Don't do that.
68
69
    This does not canonicalize the paths passed in; that should be
70
    done by the caller.
71
860 by Martin Pool
- refactor hashcache to use just one dictionary
72
    _cache
73
        Indexed by path, points to a two-tuple of the SHA-1 of the file.
74
        and its fingerprint.
846 by Martin Pool
- start adding refactored/simplified hash cache
75
76
    stat_count
77
        number of times files have been statted
78
79
    hit_count
80
        number of times files have been retrieved from the cache, avoiding a
81
        re-read
82
        
83
    miss_count
84
        number of misses (times files have been completely re-read)
85
    """
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
86
    needs_write = False
87
846 by Martin Pool
- start adding refactored/simplified hash cache
88
    def __init__(self, basedir):
89
        self.basedir = basedir
90
        self.hit_count = 0
91
        self.miss_count = 0
92
        self.stat_count = 0
93
        self.danger_count = 0
860 by Martin Pool
- refactor hashcache to use just one dictionary
94
        self._cache = {}
846 by Martin Pool
- start adding refactored/simplified hash cache
95
96
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
97
98
    def cache_file_name(self):
99
        import os.path
100
        return os.path.join(self.basedir, '.bzr', 'stat-cache')
101
102
103
104
846 by Martin Pool
- start adding refactored/simplified hash cache
105
    def clear(self):
860 by Martin Pool
- refactor hashcache to use just one dictionary
106
        """Discard all cached information.
107
108
        This does not reset the counters."""
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
109
        if self._cache:
110
            self.needs_write = True
111
            self._cache = {}
846 by Martin Pool
- start adding refactored/simplified hash cache
112
113
114
    def get_sha1(self, path):
115
        """Return the hex SHA-1 of the contents of the file at path.
116
117
        XXX: If the file does not exist or is not a plain file???
118
        """
119
120
        import os, time
121
        from bzrlib.osutils import sha_file
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
122
        from bzrlib.trace import mutter
846 by Martin Pool
- start adding refactored/simplified hash cache
123
        
124
        abspath = os.path.join(self.basedir, path)
125
        fp = _fingerprint(abspath)
860 by Martin Pool
- refactor hashcache to use just one dictionary
126
        c = self._cache.get(path)
127
        if c:
128
            cache_sha1, cache_fp = c
129
        else:
130
            cache_sha1, cache_fp = None, None
846 by Martin Pool
- start adding refactored/simplified hash cache
131
132
        self.stat_count += 1
133
134
        if not fp:
135
            # not a regular file
136
            return None
137
        elif cache_fp and (cache_fp == fp):
138
            self.hit_count += 1
860 by Martin Pool
- refactor hashcache to use just one dictionary
139
            return cache_sha1
846 by Martin Pool
- start adding refactored/simplified hash cache
140
        else:
141
            self.miss_count += 1
142
            digest = sha_file(file(abspath, 'rb'))
143
144
            now = int(time.time())
145
            if fp[1] >= now or fp[2] >= now:
146
                # changed too recently; can't be cached.  we can
147
                # return the result and it could possibly be cached
148
                # next time.
149
                self.danger_count += 1 
150
                if cache_fp:
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
151
                    mutter("remove outdated entry for %s" % path)
152
                    self.needs_write = True
860 by Martin Pool
- refactor hashcache to use just one dictionary
153
                    del self._cache[path]
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
154
            elif (fp != cache_fp) or (digest != cache_sha1):
155
                mutter("update entry for %s" % path)
156
                mutter("  %r" % (fp,))
157
                mutter("  %r" % (cache_fp,))
158
                self.needs_write = True
860 by Martin Pool
- refactor hashcache to use just one dictionary
159
                self._cache[path] = (digest, fp)
846 by Martin Pool
- start adding refactored/simplified hash cache
160
161
            return digest
162
859 by Martin Pool
- add HashCache.write and a simple test for it
163
164
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
165
    def write(self):
859 by Martin Pool
- add HashCache.write and a simple test for it
166
        """Write contents of cache to file."""
167
        from atomicfile import AtomicFile
168
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
169
        outf = AtomicFile(self.cache_file_name(), 'wb')
859 by Martin Pool
- add HashCache.write and a simple test for it
170
        try:
862 by Martin Pool
- code to re-read hashcache from file
171
            print >>outf, CACHE_HEADER,
859 by Martin Pool
- add HashCache.write and a simple test for it
172
860 by Martin Pool
- refactor hashcache to use just one dictionary
173
            for path, c  in self._cache.iteritems():
859 by Martin Pool
- add HashCache.write and a simple test for it
174
                assert '//' not in path, path
175
                outf.write(path.encode('utf-8'))
176
                outf.write('// ')
860 by Martin Pool
- refactor hashcache to use just one dictionary
177
                print >>outf, c[0],     # hex sha1
178
                for fld in c[1]:
862 by Martin Pool
- code to re-read hashcache from file
179
                    print >>outf, "%d" % fld,
859 by Martin Pool
- add HashCache.write and a simple test for it
180
                print >>outf
181
182
            outf.commit()
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
183
            self.needs_write = False
859 by Martin Pool
- add HashCache.write and a simple test for it
184
        finally:
185
            if not outf.closed:
186
                outf.abort()
187
        
862 by Martin Pool
- code to re-read hashcache from file
188
189
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
190
    def read(self):
862 by Martin Pool
- code to re-read hashcache from file
191
        """Reinstate cache from file.
192
193
        Overwrites existing cache.
194
195
        If the cache file has the wrong version marker, this just clears 
196
        the cache."""
197
        from bzrlib.trace import mutter, warning
198
199
        self._cache = {}
200
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
201
        fn = self.cache_file_name()
202
        try:
203
            inf = file(fn, 'rb')
204
        except IOError, e:
205
            mutter("failed to open %s: %s" % (fn, e))
206
            return
207
208
862 by Martin Pool
- code to re-read hashcache from file
209
        hdr = inf.readline()
210
        if hdr != CACHE_HEADER:
211
            mutter('cache header marker not found at top of %s; discarding cache'
878 by Martin Pool
- fix typo
212
                   % fn)
862 by Martin Pool
- code to re-read hashcache from file
213
            return
214
215
        for l in inf:
216
            pos = l.index('// ')
217
            path = l[:pos].decode('utf-8')
218
            if path in self._cache:
219
                warning('duplicated path %r in cache' % path)
220
                continue
221
222
            pos += 3
223
            fields = l[pos:].split(' ')
224
            if len(fields) != 6:
225
                warning("bad line in hashcache: %r" % l)
226
                continue
227
228
            sha1 = fields[0]
229
            if len(sha1) != 40:
230
                warning("bad sha1 in hashcache: %r" % sha1)
231
                continue
232
233
            fp = tuple(map(long, fields[1:]))
234
235
            self._cache[path] = (sha1, fp)
236
866 by Martin Pool
- use new path-based hashcache for WorkingTree- squash mtime/ctime to whole seconds- update and if necessary write out hashcache when WorkingTree object is created.
237
        self.needs_write = False
238
           
239
862 by Martin Pool
- code to re-read hashcache from file
240
241