# Copyright (C) 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

from cStringIO import StringIO

from bzrlib import (
    osutils,
    progress,
    )
from bzrlib.ui import ui_factory

class RenameMap(object):
    """Determine a mapping of renames."""

    def __init__(self, tree):
        self.tree = tree
        # Maps a truncated edge hash to the set of tags (file-ids) whose
        # content contains that edge.
        self.edge_hashes = {}

    @staticmethod
    def iter_edge_hashes(lines):
        """Iterate through the hashes of line pairs (which make up an edge).

        The hash is truncated using a modulus to avoid excessive memory
        consumption by the hitscount dict.  A modulus of 10Mi means that the
        maximum number of keys is 10Mi.  (Keys are normally 32 bits, e.g.
        4 Gi)
        """
        modulus = 1024 * 1024 * 10
        for n in range(len(lines)):
            # The final iteration hashes a 1-tuple, so the last line still
            # contributes an edge.
            yield hash(tuple(lines[n:n+2])) % modulus

    def add_edge_hashes(self, lines, tag):
        """Update edge_hashes to include the given lines.

        :param lines: The lines to update the hashes for.
        :param tag: A tag uniquely associated with these lines (i.e. file-id)
        """
        for my_hash in self.iter_edge_hashes(lines):
            self.edge_hashes.setdefault(my_hash, set()).add(tag)

    def add_file_edge_hashes(self, tree, file_ids):
        """Update to reflect the hashes for files in the tree.

        :param tree: The tree containing the files.
        :param file_ids: A list of file_ids to perform the updates for.
        """
        desired_files = [(f, f) for f in file_ids]
        task = ui_factory.nested_progress_bar()
        try:
            for num, (file_id, contents) in enumerate(
                tree.iter_files_bytes(desired_files)):
                task.update('Calculating hashes', num, len(file_ids))
                s = StringIO()
                s.writelines(contents)
                s.seek(0)
                self.add_edge_hashes(s.readlines(), file_id)
        finally:
            task.finished()

    def hitcounts(self, lines):
        """Count the number of hash hits for each tag, for the given lines.

        Hits are weighted according to the number of tags the hash is
        associated with; more tags means that the hash is less rare and should
        tend to be ignored.

        :param lines: The lines to calculate hashes of.
        :return: a dict of {tag: hitcount}
        """
        hits = {}
        for my_hash in self.iter_edge_hashes(lines):
            tags = self.edge_hashes.get(my_hash)
            if tags is None:
                continue
            taglen = len(tags)
            for tag in tags:
                if tag not in hits:
                    hits[tag] = 0
                # Weight each hit by the rarity of the hash.
                hits[tag] += 1.0 / taglen
        return hits

    def get_all_hits(self, paths):
        """Find all the hit counts for the listed paths in the tree.

        :return: A list of tuples of count, path, file_id.
        """
        all_hits = []
        task = ui_factory.nested_progress_bar()
        try:
            for num, path in enumerate(paths):
                task.update('Determining hash hits', num, len(paths))
                hits = self.hitcounts(self.tree.get_file_lines(None,
                                                               path=path))
                all_hits.extend((v, path, k) for k, v in hits.items())
        finally:
            task.finished()
        return all_hits

    def file_match(self, paths):
        """Return a mapping from file_ids to the supplied paths."""
        return self._match_hits(self.get_all_hits(paths))

    @staticmethod
    def _match_hits(hit_list):
        """Using a hit list, determine a path-to-fileid map.

        The hit list is a list of (count, path, file_id), where count is a
        (possibly float) number, with higher numbers indicating stronger
        matches.
        """
        seen_file_ids = set()
        path_map = {}
        # Sorting in reverse order means the strongest hits are claimed
        # first, so each path and each file_id is used at most once.
        for count, path, file_id in sorted(hit_list, reverse=True):
            if path in path_map or file_id in seen_file_ids:
                continue
            path_map[path] = file_id
            seen_file_ids.add(file_id)
        return path_map

    def get_required_parents(self, matches):
        """Return a dict of all file parents that must be versioned.

        The keys are the required parents and the values are sets of their
        children's file-ids.
        """
        required_parents = {}
        for path in matches:
            # Walk up from each matched path until an already-versioned
            # directory is found; everything below it must be added.
            while True:
                child = path
                path = osutils.dirname(path)
                if self.tree.path2id(path) is not None:
                    break
                required_parents.setdefault(path, []).append(child)
        require_ids = {}
        for parent, children in required_parents.iteritems():
            child_file_ids = set()
            for child in children:
                file_id = matches.get(child)
                if file_id is not None:
                    child_file_ids.add(file_id)
            require_ids[parent] = child_file_ids
        return require_ids

    def match_parents(self, required_parents, missing_parents):
        """Map parent directories to file-ids.

        This is done by finding similarity between the file-ids of children of
        required parent directories and the file-ids of children of missing
        parent directories.
        """
        all_hits = []
        for file_id, file_id_children in missing_parents.iteritems():
            for path, path_children in required_parents.iteritems():
                hits = len(path_children.intersection(file_id_children))
                if hits > 0:
                    all_hits.append((hits, path, file_id))
        return self._match_hits(all_hits)

    def _find_missing_files(self, basis):
        """Find missing files and unversioned rename candidates.

        :param basis: The basis tree to compare the working tree against.
        :return: a tuple of (missing_files, missing_parents,
            candidate_files).
        """
        missing_files = set()
        missing_parents = {}
        candidate_files = set()
        task = ui_factory.nested_progress_bar()
        iterator = self.tree.iter_changes(basis, want_unversioned=True,
                                          pb=task)
        try:
            for (file_id, paths, changed_content, versioned, parent, name,
                 kind, executable) in iterator:
                # Versioned entries with no kind in the working tree have
                # gone missing; record them as potential rename sources.
                if kind[1] is None and versioned[1]:
                    missing_parents.setdefault(parent[0], set()).add(file_id)
                    if kind[0] == 'file':
                        missing_files.add(file_id)
                    else:
                        #other kinds are not handled
                        pass
                # Unversioned entries are potential rename targets.
                if versioned == (False, False):
                    if self.tree.is_ignored(paths[1]):
                        continue
                    if kind[1] == 'file':
                        candidate_files.add(paths[1])
                    if kind[1] == 'directory':
                        for _dir, children in self.tree.walkdirs(paths[1]):
                            for child in children:
                                if child[2] == 'file':
                                    candidate_files.add(child[0])
        finally:
            task.finished()
        return missing_files, missing_parents, candidate_files

    @classmethod
    def guess_renames(klass, tree):
        """Guess which files to rename, and perform the rename.

        We assume that unversioned files and missing files indicate that
        versioned files have been renamed outside of Bazaar.

        :param tree: A write-locked working tree.
        """
        required_parents = {}
        task = ui_factory.nested_progress_bar()
        try:
            pp = progress.ProgressPhase('Guessing renames', 4, task)
            basis = tree.basis_tree()
            basis.lock_read()
            try:
                rn = klass(tree)
                pp.next_phase()
                missing_files, missing_parents, candidate_files = (
                    rn._find_missing_files(basis))
                pp.next_phase()
                rn.add_file_edge_hashes(basis, missing_files)
            finally:
                basis.unlock()
            pp.next_phase()
            matches = rn.file_match(candidate_files)
            parents_matches = matches
            # Keep matching parent directories until no new matches appear.
            while len(parents_matches) > 0:
                required_parents = rn.get_required_parents(
                    parents_matches)
                parents_matches = rn.match_parents(required_parents,
                                                   missing_parents)
                matches.update(parents_matches)
            pp.next_phase()
            rn._update_tree(required_parents, matches)
        finally:
            task.finished()

    def _update_tree(self, required_parents, matches):
        """Version the required parents and apply the matched renames."""
        self.tree.add(required_parents)
        delta = []
        file_id_matches = dict((f, p) for p, f in matches.items())
        for old_path, entry in self.tree.iter_entries_by_dir(matches.values()):
            new_path = file_id_matches[entry.file_id]
            parent_path, new_name = osutils.split(new_path)
            parent_id = matches.get(parent_path)
            if parent_id is None:
                parent_id = self.tree.path2id(parent_path)
            new_entry = entry.copy()
            new_entry.parent_id = parent_id
            new_entry.name = new_name
            delta.append((old_path, new_path, new_entry.file_id, new_entry))
        self.tree.apply_inventory_delta(delta)