~bzr-pqm/bzr/bzr.dev

3193.8.13 by Aaron Bentley
Update texts
1
# Copyright (C) 2009 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
3193.8.32 by Aaron Bentley
Update GPL preamble
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3193.8.13 by Aaron Bentley
Update texts
16
17
3193.8.4 by Aaron Bentley
Get rename detection working for files.
18
from cStringIO import StringIO
3193.8.18 by Aaron Bentley
Move all rename-guessing into RenameMap
19
20
from bzrlib import (
21
    osutils,
22
    progress,
3193.8.33 by Aaron Bentley
Add output, emit minimal inventory delta.
23
    trace,
3193.8.18 by Aaron Bentley
Move all rename-guessing into RenameMap
24
)
3193.8.16 by Aaron Bentley
Get a dict of required parents.
25
from bzrlib.ui import ui_factory
3193.8.4 by Aaron Bentley
Get rename detection working for files.
26
27
28
class RenameMap(object):
    """Determine a mapping of renames.

    Texts are compared through hashes of adjacent line pairs ("edges").
    Hashes of known texts are registered under a tag (e.g. a file-id) and
    candidate texts are then scored against them.
    """

    def __init__(self, tree):
        """Create a RenameMap operating on the given tree.

        :param tree: The tree whose paths are examined; kept as self.tree.
        """
        self.tree = tree
        # Maps a truncated edge hash to the set of tags whose text has it.
        self.edge_hashes = {}

    @staticmethod
    def iter_edge_hashes(lines):
        """Iterate through the hashes of line pairs (which make up an edge).

        One hash is yielded per line: each line is hashed together with the
        line following it, and the final line is hashed on its own.

        The hash is truncated using a modulus to avoid excessive memory
        consumption by the dict that is keyed on these hashes.  A modulus of
        10Mi means that the maximum number of keys is 10Mi.  (Keys are
        normally 32 bits, e.g. 4 Gi)

        :param lines: A sequence of lines.
        :return: An iterator of integers in the range [0, 10Mi).
        """
        modulus = 1024 * 1024 * 10
        for n in range(len(lines)):
            # The slice is one element short for the last line; that is
            # deliberate and still produces a hashable tuple.
            yield hash(tuple(lines[n:n + 2])) % modulus

    def add_edge_hashes(self, lines, tag):
        """Update edge_hashes to include the given lines.

        :param lines: The lines to update the hashes for.
        :param tag: A tag uniquely associated with these lines (i.e. file-id)
        """
        for my_hash in self.iter_edge_hashes(lines):
            self.edge_hashes.setdefault(my_hash, set()).add(tag)

    def add_file_edge_hashes(self, tree, file_ids):
        """Update to reflect the hashes for files in the tree.

        :param tree: The tree containing the files.
        :param file_ids: A list of file_ids to perform the updates for.
        """
        # Each file-id doubles as the identifier handed back with its bytes.
        desired_files = [(f, f) for f in file_ids]
        task = ui_factory.nested_progress_bar()
        try:
            for num, (file_id, contents) in enumerate(
                tree.iter_files_bytes(desired_files)):
                task.update('Calculating hashes', num, len(file_ids))
                # Re-split the content into lines by round-tripping it
                # through an in-memory file.
                s = StringIO()
                s.writelines(contents)
                s.seek(0)
                self.add_edge_hashes(s.readlines(), file_id)
        finally:
            task.finished()

    def hitcounts(self, lines):
        """Count the number of hash hits for each tag, for the given lines.

        Hits are weighted according to the number of tags the hash is
        associated with; more tags means that the hash is less rare and should
        tend to be ignored.

        :param lines: The lines to calculate hashes of.
        :return: a dict of {tag: hitcount}
        """
        hits = {}
        for my_hash in self.iter_edge_hashes(lines):
            tags = self.edge_hashes.get(my_hash)
            if tags is None:
                continue
            # A hash shared by N tags contributes 1/N to each of them.
            weight = 1.0 / len(tags)
            for tag in tags:
                hits[tag] = hits.get(tag, 0) + weight
        return hits

    def get_all_hits(self, paths):
        """Find all the hit counts for the listed paths in the tree.

        :param paths: The paths in self.tree whose lines should be scored.
        :return: A list of tuples of count, path, file_id.
        """
        all_hits = []
        task = ui_factory.nested_progress_bar()
        try:
            for num, path in enumerate(paths):
                task.update('Determining hash hits', num, len(paths))
                hits = self.hitcounts(self.tree.get_file_lines(None,
                                                               path=path))
                all_hits.extend((v, path, k) for k, v in hits.items())
        finally:
            task.finished()
        return all_hits

    def file_match(self, paths):
        """Return a mapping from the supplied paths to file_ids.

        :param paths: The candidate paths to find file_ids for.
        :return: A dict of {path: file_id}; paths without a match are absent.
        """
        return self._match_hits(self.get_all_hits(paths))

    @staticmethod
    def _match_hits(hit_list):
        """Using a hit list, determine a path-to-fileid map.

        The hit list is a list of (count, path, file_id), where count is a
        (possibly float) number, with higher numbers indicating stronger
        matches.

        Hits are considered from strongest to weakest; each path and each
        file_id is used at most once.

        :return: A dict of {path: file_id}.
        """
        seen_file_ids = set()
        path_map = {}
        for count, path, file_id in sorted(hit_list, reverse=True):
            if path in path_map or file_id in seen_file_ids:
                continue
            path_map[path] = file_id
            seen_file_ids.add(file_id)
        return path_map

    def get_required_parents(self, matches):
        """Return a dict of all file parents that must be versioned.

        The keys are the required parents and the values are sets of their
        children.

        :param matches: A dict of {path: file_id}.
        :return: A dict of {parent path: set of file_ids}.  Only children
            that appear in matches contribute a file_id.
        """
        required_parents = {}
        for path in matches:
            # Walk up until an ancestor that already has a file-id is found.
            while True:
                child = path
                path = osutils.dirname(path)
                if self.tree.path2id(path) is not None:
                    break
                if path == child:
                    # dirname() made no progress, so there are no further
                    # ancestors to try; stop instead of looping forever.
                    break
                required_parents.setdefault(path, []).append(child)
        require_ids = {}
        for parent, children in required_parents.items():
            child_file_ids = set()
            for child in children:
                file_id = matches.get(child)
                if file_id is not None:
                    child_file_ids.add(file_id)
            require_ids[parent] = child_file_ids
        return require_ids

    def match_parents(self, required_parents, missing_parents):
        """Map parent directories to file-ids.

        This is done by finding similarity between the file-ids of children of
        required parent directories and the file-ids of children of missing
        parent directories.

        :param required_parents: A dict of {path: set of child file_ids}.
        :param missing_parents: A dict of {file_id: set of child file_ids}.
        :return: A dict of {path: file_id}.
        """
        all_hits = []
        for file_id, file_id_children in missing_parents.items():
            for path, path_children in required_parents.items():
                # The score is the number of children the two have in common.
                hits = len(path_children.intersection(file_id_children))
                if hits > 0:
                    all_hits.append((hits, path, file_id))
        return self._match_hits(all_hits)

    def _find_missing_files(self, basis):
        """Scan the changes between basis and self.tree for rename inputs.

        :param basis: The tree that self.tree is compared against.
        :return: A tuple of (missing_files, missing_parents, candidate_files):
            missing_files is a set of file_ids of versioned entries that were
            files in basis and have no kind in self.tree; missing_parents is a
            dict of {basis parent id: set of file_ids} for every versioned
            entry with no kind in self.tree; candidate_files is a set of
            paths of unversioned, non-ignored files, including files found
            below unversioned directories.
        """
        missing_files = set()
        missing_parents = {}
        candidate_files = set()
        task = ui_factory.nested_progress_bar()
        try:
            # Called inside the try block so that the progress bar is always
            # released, even if starting the comparison fails.
            iterator = self.tree.iter_changes(basis, want_unversioned=True,
                                              pb=task)
            for (file_id, paths, changed_content, versioned, parent, name,
                 kind, executable) in iterator:
                if kind[1] is None and versioned[1]:
                    missing_parents.setdefault(parent[0], set()).add(file_id)
                    if kind[0] == 'file':
                        missing_files.add(file_id)
                    else:
                        #other kinds are not handled
                        pass
                if versioned == (False, False):
                    if self.tree.is_ignored(paths[1]):
                        continue
                    if kind[1] == 'file':
                        candidate_files.add(paths[1])
                    if kind[1] == 'directory':
                        for _dir, children in self.tree.walkdirs(paths[1]):
                            for child in children:
                                # presumably child[0] is the path and
                                # child[2] the kind -- confirm against
                                # walkdirs().
                                if child[2] == 'file':
                                    candidate_files.add(child[0])
        finally:
            task.finished()
        return missing_files, missing_parents, candidate_files

    @classmethod
    def guess_renames(klass, tree, dry_run=False):
        """Guess which files to rename, and perform the rename.

        We assume that unversioned files and missing files indicate that
        versioned files have been renamed outside of Bazaar.

        :param tree: A write-locked working tree.
        :param dry_run: If True, the guessed renames are reported but the
            tree is not modified.
        """
        required_parents = {}
        task = ui_factory.nested_progress_bar()
        try:
            pp = progress.ProgressPhase('Guessing renames', 4, task)
            basis = tree.basis_tree()
            basis.lock_read()
            try:
                rn = klass(tree)
                pp.next_phase()
                missing_files, missing_parents, candidate_files = (
                    rn._find_missing_files(basis))
                pp.next_phase()
                rn.add_file_edge_hashes(basis, missing_files)
            finally:
                basis.unlock()
            pp.next_phase()
            matches = rn.file_match(candidate_files)
            parents_matches = matches
            # Keep matching upwards: every round of matched paths may need
            # parents of its own.
            # NOTE(review): required_parents is overwritten on every pass, so
            # only the value computed on the final pass reaches tree.add()
            # below -- confirm this is intended.
            while len(parents_matches) > 0:
                required_parents = rn.get_required_parents(
                    parents_matches)
                parents_matches = rn.match_parents(required_parents,
                                                   missing_parents)
                matches.update(parents_matches)
            pp.next_phase()
            delta = rn._make_inventory_delta(matches)
            for old, new, file_id, entry in delta:
                trace.note("%s => %s", old, new)
            if not dry_run:
                tree.add(required_parents)
                tree.apply_inventory_delta(delta)
        finally:
            task.finished()

    def _make_inventory_delta(self, matches):
        """Build the inventory delta that applies the matched renames.

        Entries whose name and parent already agree with their matched path
        are left out of the delta.

        :param matches: A dict of {new path: file_id}.
        :return: A list of (old_path, new_path, file_id, new_entry) tuples.
        """
        delta = []
        # Inverse of matches: file_id -> new path.
        file_id_matches = dict((f, p) for p, f in matches.items())
        for old_path, entry in self.tree.iter_entries_by_dir(matches.values()):
            new_path = file_id_matches[entry.file_id]
            parent_path, new_name = osutils.split(new_path)
            # Prefer a parent id assigned by the matches themselves; fall
            # back to whatever the tree already knows for that path.
            parent_id = matches.get(parent_path)
            if parent_id is None:
                parent_id = self.tree.path2id(parent_path)
            if entry.name == new_name and entry.parent_id == parent_id:
                continue
            new_entry = entry.copy()
            new_entry.parent_id = parent_id
            new_entry.name = new_name
            delta.append((old_path, new_path, new_entry.file_id, new_entry))
        return delta