# Copyright (C) 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
from __future__ import absolute_import

from cStringIO import StringIO

from bzrlib import (
    osutils,
    progress,
    trace,
    )
from bzrlib.ui import ui_factory
from bzrlib.i18n import gettext

class RenameMap(object):
    """Determine a mapping of renames."""

    def __init__(self, tree):
        self.tree = tree
        self.edge_hashes = {}

    @staticmethod
    def iter_edge_hashes(lines):
        """Iterate through the hashes of line pairs (which make up an edge).

        The hash is truncated using a modulus to avoid excessive memory
        consumption by the hitcount dict.  A modulus of 10Mi means that the
        maximum number of keys is 10Mi.  (Keys are normally 32 bits, e.g.
        4 Gi.)
        """
        modulus = 1024 * 1024 * 10
        for n in range(len(lines)):
            yield hash(tuple(lines[n:n+2])) % modulus

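    # Illustrative note (not from the original source): for lines
    # ['a\n', 'b\n', 'c\n'], iter_edge_hashes() yields the hashes of
    # ('a\n', 'b\n'), ('b\n', 'c\n') and the trailing singleton ('c\n',),
    # because lines[n:n+2] simply runs off the end of the list.  A file that
    # keeps most of its lines after a rename therefore shares most of its
    # edge hashes with its old text.
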
    def add_edge_hashes(self, lines, tag):
        """Update edge_hashes to include the given lines.

        :param lines: The lines to update the hashes for.
        :param tag: A tag uniquely associated with these lines (i.e. file-id)
        """
        for my_hash in self.iter_edge_hashes(lines):
            self.edge_hashes.setdefault(my_hash, set()).add(tag)

    def add_file_edge_hashes(self, tree, file_ids):
        """Update to reflect the hashes for files in the tree.

        :param tree: The tree containing the files.
        :param file_ids: A list of file_ids to perform the updates for.
        """
        desired_files = [(f, f) for f in file_ids]
        task = ui_factory.nested_progress_bar()
        try:
            for num, (file_id, contents) in enumerate(
                    tree.iter_files_bytes(desired_files)):
                task.update(gettext('Calculating hashes'), num, len(file_ids))
                s = StringIO()
                s.writelines(contents)
                s.seek(0)
                self.add_edge_hashes(s.readlines(), file_id)
        finally:
            task.finished()

    def hitcounts(self, lines):
        """Count the number of hash hits for each tag, for the given lines.

        Hits are weighted according to the number of tags the hash is
        associated with; more tags means that the hash is less rare and should
        count for less.
        :param lines: The lines to calculate hashes of.
        :return: a dict of {tag: hitcount}
        """
        hits = {}
        for my_hash in self.iter_edge_hashes(lines):
            tags = self.edge_hashes.get(my_hash)
            if tags is None:
                continue
            taglen = len(tags)
            for tag in tags:
                if tag not in hits:
                    hits[tag] = 0
                hits[tag] += 1.0 / taglen
        return hits

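    # Illustrative sketch (not part of the original module): how the edge
    # hashes score a likely rename.  Registering the old text of a file and
    # then scoring a candidate text might look like this:
    #
    #   rm = RenameMap(tree)
    #   rm.add_edge_hashes(['a\n', 'b\n', 'c\n'], 'file-id-1')
    #   rm.hitcounts(['a\n', 'b\n', 'c\n', 'd\n'])
    #   # -> {'file-id-1': 2.0}: the pairs ('a\n', 'b\n') and ('b\n', 'c\n')
    #   # match, each weighted 1.0 because only one tag shares those hashes.
    #
    # A hash shared by several tags would contribute only 1.0 / len(tags) to
    # each of them, so widely shared lines count for less.
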
    def get_all_hits(self, paths):
        """Find all the hit counts for the listed paths in the tree.

        :return: A list of tuples of count, path, file_id.
        """
        all_hits = []
        task = ui_factory.nested_progress_bar()
        try:
            for num, path in enumerate(paths):
                task.update(gettext('Determining hash hits'), num, len(paths))
                hits = self.hitcounts(self.tree.get_file_lines(None,
                                                               path=path))
                all_hits.extend((v, path, k) for k, v in hits.items())
        finally:
            task.finished()
        return all_hits

    def file_match(self, paths):
        """Return a mapping from the supplied paths to file_ids."""
        return self._match_hits(self.get_all_hits(paths))

    @staticmethod
    def _match_hits(hit_list):
        """Using a hit list, determine a path-to-fileid map.

        The hit list is a list of (count, path, file_id), where count is a
        (possibly float) number, with higher numbers indicating stronger
        matches.
        """
        seen_file_ids = set()
        path_map = {}
        for count, path, file_id in sorted(hit_list, reverse=True):
            if path in path_map or file_id in seen_file_ids:
                continue
            path_map[path] = file_id
            seen_file_ids.add(file_id)
        return path_map

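    # Illustrative sketch (not part of the original module): _match_hits picks
    # matches greedily, strongest first, never reusing a path or a file_id.
    # For a hypothetical hit list:
    #
    #   RenameMap._match_hits([(3.0, 'new/a.txt', 'id-a'),
    #                          (2.0, 'new/a.txt', 'id-b'),
    #                          (1.0, 'new/b.txt', 'id-b')])
    #   # -> {'new/a.txt': 'id-a', 'new/b.txt': 'id-b'}
    #
    # 'id-b' loses 'new/a.txt' to the stronger hit and falls through to
    # 'new/b.txt'.
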
    def get_required_parents(self, matches):
        """Return a dict of all file parents that must be versioned.

        The keys are the required parents and the values are sets of their
        children.
        """
        required_parents = {}
        for path in matches:
            while True:
                child = path
                path = osutils.dirname(path)
                if self.tree.path2id(path) is not None:
                    break
                required_parents.setdefault(path, []).append(child)
        require_ids = {}
        for parent, children in required_parents.iteritems():
            child_file_ids = set()
            for child in children:
                file_id = matches.get(child)
                if file_id is not None:
                    child_file_ids.add(file_id)
            require_ids[parent] = child_file_ids
        return require_ids

    def match_parents(self, required_parents, missing_parents):
        """Map parent directories to file-ids.

        This is done by finding similarity between the file-ids of children of
        required parent directories and the file-ids of children of missing
        parent directories.
        """
        all_hits = []
        for file_id, file_id_children in missing_parents.iteritems():
            for path, path_children in required_parents.iteritems():
                hits = len(path_children.intersection(file_id_children))
                if hits > 0:
                    all_hits.append((hits, path, file_id))
        return self._match_hits(all_hits)

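    # Illustrative sketch (not part of the original module): directories are
    # matched by intersecting their sets of child file-ids.  With hypothetical
    # inputs:
    #
    #   required_parents = {'new/dir': set(['id-1', 'id-2'])}
    #   missing_parents = {'old-dir-id': set(['id-1', 'id-2', 'id-3'])}
    #   rn.match_parents(required_parents, missing_parents)
    #   # -> {'new/dir': 'old-dir-id'}: the two shared children give the hit
    #   # count, and _match_hits turns the hit list into the mapping.
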
    def _find_missing_files(self, basis):
        missing_files = set()
        missing_parents = {}
        candidate_files = set()
        task = ui_factory.nested_progress_bar()
        iterator = self.tree.iter_changes(basis, want_unversioned=True,
                                          pb=task)
        try:
            for (file_id, paths, changed_content, versioned, parent, name,
                 kind, executable) in iterator:
                if kind[1] is None and versioned[1]:
                    missing_parents.setdefault(parent[0], set()).add(file_id)
                    if kind[0] == 'file':
                        missing_files.add(file_id)
                    else:
                        # other kinds are not handled
                        pass
                if versioned == (False, False):
                    if self.tree.is_ignored(paths[1]):
                        continue
                    if kind[1] == 'file':
                        candidate_files.add(paths[1])
                    if kind[1] == 'directory':
                        for _dir, children in self.tree.walkdirs(paths[1]):
                            for child in children:
                                if child[2] == 'file':
                                    candidate_files.add(child[0])
        finally:
            task.finished()
        return missing_files, missing_parents, candidate_files

    @classmethod
    def guess_renames(klass, tree, dry_run=False):
        """Guess which files to rename, and perform the rename.

        We assume that unversioned files and missing files indicate that
        versioned files have been renamed outside of Bazaar.

        :param tree: A write-locked working tree.
        :param dry_run: If True, report the renames that would be made, but
            do not make them.
        """
        required_parents = {}
        task = ui_factory.nested_progress_bar()
        try:
            pp = progress.ProgressPhase('Guessing renames', 4, task)
            basis = tree.basis_tree()
            basis.lock_read()
            try:
                rn = klass(tree)
                pp.next_phase()
                missing_files, missing_parents, candidate_files = (
                    rn._find_missing_files(basis))
                pp.next_phase()
                rn.add_file_edge_hashes(basis, missing_files)
            finally:
                basis.unlock()
            pp.next_phase()
            matches = rn.file_match(candidate_files)
            parents_matches = matches
            while len(parents_matches) > 0:
                required_parents = rn.get_required_parents(
                    parents_matches)
                parents_matches = rn.match_parents(required_parents,
                                                   missing_parents)
                matches.update(parents_matches)
            pp.next_phase()
            delta = rn._make_inventory_delta(matches)
            for old, new, file_id, entry in delta:
                trace.note(gettext("{0} => {1}").format(old, new))
            if not dry_run:
                tree.add(required_parents)
                tree.apply_inventory_delta(delta)
        finally:
            task.finished()

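    # Typical use (a sketch, not part of the original module): this class is
    # what 'bzr mv --auto' drives.  A caller would hold a write lock on the
    # working tree, roughly:
    #
    #   from bzrlib.workingtree import WorkingTree
    #   tree = WorkingTree.open('.')
    #   tree.lock_tree_write()
    #   try:
    #       RenameMap.guess_renames(tree, dry_run=True)
    #   finally:
    #       tree.unlock()
    #
    # With dry_run=True the proposed renames are only reported via trace.note.
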
    def _make_inventory_delta(self, matches):
        delta = []
        file_id_matches = dict((f, p) for p, f in matches.items())
        for old_path, entry in self.tree.iter_entries_by_dir(matches.values()):
            new_path = file_id_matches[entry.file_id]
            parent_path, new_name = osutils.split(new_path)
            parent_id = matches.get(parent_path)
            if parent_id is None:
                parent_id = self.tree.path2id(parent_path)
            if entry.name == new_name and entry.parent_id == parent_id:
                continue
            new_entry = entry.copy()
            new_entry.parent_id = parent_id
            new_entry.name = new_name
            delta.append((old_path, new_path, new_entry.file_id, new_entry))
        return delta