# Copyright (C) 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
from cStringIO import StringIO

from bzrlib import (
    osutils,
    progress,
    trace,
    )
from bzrlib.ui import ui_factory
class RenameMap(object):
    """Determine a mapping of renames."""

    def __init__(self, tree):
        self.tree = tree
        # edge-hash -> set of tags (file-ids) whose content produced that
        # hash; populated by add_edge_hashes.
        self.edge_hashes = {}

    @staticmethod
    def iter_edge_hashes(lines):
        """Iterate through the hashes of line pairs (which make up an edge).

        The hash is truncated using a modulus to avoid excessive memory
        consumption by the hitscount dict.  A modulus of 10Mi means that the
        maximum number of keys is 10Mi.  (Keys are normally 32 bits, e.g.
        4 Gi.)
        """
        modulus = 1024 * 1024 * 10
        for n in range(len(lines)):
            # Note: the final iteration hashes a 1-tuple (the last line on
            # its own), so len(lines) hashes are yielded in total.
            yield hash(tuple(lines[n:n+2])) % modulus

    def add_edge_hashes(self, lines, tag):
        """Update edge_hashes to include the given lines.

        :param lines: The lines to update the hashes for.
        :param tag: A tag uniquely associated with these lines (i.e. file-id)
        """
        for my_hash in self.iter_edge_hashes(lines):
            self.edge_hashes.setdefault(my_hash, set()).add(tag)

    def add_file_edge_hashes(self, tree, file_ids):
        """Update to reflect the hashes for files in the tree.

        :param tree: The tree containing the files.
        :param file_ids: A list of file_ids to perform the updates for.
        """
        desired_files = [(f, f) for f in file_ids]
        task = ui_factory.nested_progress_bar()
        try:
            for num, (file_id, contents) in enumerate(
                tree.iter_files_bytes(desired_files)):
                task.update('Calculating hashes', num, len(file_ids))
                # Buffer the chunked bytes so they can be re-read as lines.
                s = StringIO()
                s.writelines(contents)
                s.seek(0)
                self.add_edge_hashes(s.readlines(), file_id)
        finally:
            task.finished()

    def hitcounts(self, lines):
        """Count the number of hash hits for each tag, for the given lines.

        Hits are weighted according to the number of tags the hash is
        associated with; more tags means that the hash is less rare and should
        count for less in each tag's total.

        :param lines: The lines to calculate hashes of.
        :return: a dict of {tag: hitcount}
        """
        hits = {}
        for my_hash in self.iter_edge_hashes(lines):
            tags = self.edge_hashes.get(my_hash)
            if tags is None:
                continue
            # Split the hit evenly across every tag sharing this hash.
            taglen = len(tags)
            for tag in tags:
                if tag not in hits:
                    hits[tag] = 0
                hits[tag] += 1.0 / taglen
        return hits

    def get_all_hits(self, paths):
        """Find all the hit counts for the listed paths in the tree.

        :param paths: The paths to calculate hits for.
        :return: A list of tuples of count, path, file_id.
        """
        all_hits = []
        task = ui_factory.nested_progress_bar()
        try:
            for num, path in enumerate(paths):
                task.update('Determining hash hits', num, len(paths))
                hits = self.hitcounts(self.tree.get_file_lines(None,
                                                               path))
                all_hits.extend((v, path, k) for k, v in hits.items())
        finally:
            task.finished()
        return all_hits

    def file_match(self, paths):
        """Return a mapping from file_ids to the supplied paths."""
        return self._match_hits(self.get_all_hits(paths))

    @staticmethod
    def _match_hits(hit_list):
        """Using a hit list, determine a path-to-fileid map.

        The hit list is a list of (count, path, file_id), where count is a
        (possibly float) number, with higher numbers indicating stronger
        matches.

        :return: A {path: file_id} map, where each path and file_id is used
            at most once (greedy assignment, strongest hits first).
        """
        seen_file_ids = set()
        path_map = {}
        for count, path, file_id in sorted(hit_list, reverse=True):
            if path in path_map or file_id in seen_file_ids:
                continue
            path_map[path] = file_id
            seen_file_ids.add(file_id)
        return path_map

    def get_required_parents(self, matches):
        """Return a dict of all file parents that must be versioned.

        The keys are the required parents and the values are sets of their
        children's file-ids.
        """
        required_parents = {}
        for path in matches:
            # Walk up until an already-versioned directory is found,
            # recording every unversioned ancestor on the way.
            while True:
                child = path
                path = osutils.dirname(path)
                if self.tree.path2id(path) is not None:
                    break
                required_parents.setdefault(path, []).append(child)
        require_ids = {}
        for parent, children in required_parents.iteritems():
            child_file_ids = set()
            for child in children:
                file_id = matches.get(child)
                if file_id is not None:
                    child_file_ids.add(file_id)
            require_ids[parent] = child_file_ids
        return require_ids

    def match_parents(self, required_parents, missing_parents):
        """Map parent directories to file-ids.

        This is done by finding similarity between the file-ids of children of
        required parent directories and the file-ids of children of missing
        parent directories.

        :return: A {path: file_id} map for the matched parent directories.
        """
        all_hits = []
        for file_id, file_id_children in missing_parents.iteritems():
            for path, path_children in required_parents.iteritems():
                hits = len(path_children.intersection(file_id_children))
                if hits > 0:
                    all_hits.append((hits, path, file_id))
        return self._match_hits(all_hits)

    def _find_missing_files(self, basis):
        """Scan for missing versioned files and unversioned candidates.

        :param basis: The basis tree to compare the working tree against.
        :return: (missing_files, missing_parents, candidate_files) where
            missing_files is a set of file-ids gone from the working tree,
            missing_parents maps basis parent-ids to sets of missing child
            file-ids, and candidate_files is a set of unversioned,
            non-ignored file paths.
        """
        missing_files = set()
        missing_parents = {}
        candidate_files = set()
        task = ui_factory.nested_progress_bar()
        iterator = self.tree.iter_changes(basis, want_unversioned=True,
                                          pb=task)
        try:
            for (file_id, paths, changed_content, versioned, parent, name,
                 kind, executable) in iterator:
                if kind[1] is None and versioned[1]:
                    # Still versioned, but gone from disk: a rename source.
                    missing_parents.setdefault(parent[0], set()).add(file_id)
                    if kind[0] == 'file':
                        missing_files.add(file_id)
                    else:
                        #other kinds are not handled
                        pass
                if versioned == (False, False):
                    # Unversioned in both trees: a rename target candidate.
                    if self.tree.is_ignored(paths[1]):
                        continue
                    if kind[1] == 'file':
                        candidate_files.add(paths[1])
                    if kind[1] == 'directory':
                        for _dir, children in self.tree.walkdirs(paths[1]):
                            for child in children:
                                if child[2] == 'file':
                                    candidate_files.add(child[0])
        finally:
            task.finished()
        return missing_files, missing_parents, candidate_files

    @classmethod
    def guess_renames(klass, tree, dry_run=False):
        """Guess which files to rename, and perform the rename.

        We assume that unversioned files and missing files indicate that
        versioned files have been renamed outside of Bazaar.

        :param tree: A write-locked working tree.
        :param dry_run: If True, the renames are determined (and noted) but
            not applied to the tree.
        """
        required_parents = {}
        task = ui_factory.nested_progress_bar()
        try:
            pp = progress.ProgressPhase('Guessing renames', 4, task)
            basis = tree.basis_tree()
            basis.lock_read()
            try:
                rn = klass(tree)
                pp.next_phase()
                missing_files, missing_parents, candidate_files = (
                    rn._find_missing_files(basis))
                pp.next_phase()
                # Hash the basis content of the missing files so candidates
                # can be scored against it.
                rn.add_file_edge_hashes(basis, missing_files)
            finally:
                basis.unlock()
            pp.next_phase()
            matches = rn.file_match(candidate_files)
            parents_matches = matches
            # Iterate: newly-matched children may let us match their
            # parent directories, which may in turn enable more matches.
            while len(parents_matches) > 0:
                required_parents = rn.get_required_parents(
                    parents_matches)
                parents_matches = rn.match_parents(required_parents,
                                                   missing_parents)
                matches.update(parents_matches)
            pp.next_phase()
            delta = rn._make_inventory_delta(matches)
            for old, new, file_id, entry in delta:
                trace.note("%s => %s", old, new)
            if not dry_run:
                tree.add(required_parents)
                tree.apply_inventory_delta(delta)
        finally:
            task.finished()

    def _make_inventory_delta(self, matches):
        """Convert a {path: file_id} match map into an inventory delta.

        :param matches: A {new_path: file_id} map of determined renames.
        :return: A list of (old_path, new_path, file_id, new_entry) tuples
            suitable for Tree.apply_inventory_delta; entries already at
            their target name/parent are omitted.
        """
        delta = []
        file_id_matches = dict((f, p) for p, f in matches.items())
        for old_path, entry in self.tree.iter_entries_by_dir(matches.values()):
            new_path = file_id_matches[entry.file_id]
            parent_path, new_name = osutils.split(new_path)
            # Prefer a parent matched in this run; fall back to a parent
            # already versioned in the tree.
            parent_id = matches.get(parent_path)
            if parent_id is None:
                parent_id = self.tree.path2id(parent_path)
            if entry.name == new_name and entry.parent_id == parent_id:
                continue
            new_entry = entry.copy()
            new_entry.parent_id = parent_id
            new_entry.name = new_name
            delta.append((old_path, new_path, new_entry.file_id, new_entry))
        return delta