34
34
def iter_edge_hashes(lines):
35
"""Iterate through the hashes of line pairs (which make up an edge)."""
35
"""Iterate through the hashes of line pairs (which make up an edge).
37
The hash is truncated using a modulus to avoid excessive memory
38
consumption by the hitscount dict. A modulus of 10Mi means that the
39
maximum number of keys is 10Mi. (Keys are normally 32 bits, e.g.
36
42
modulus = 1024 * 1024 * 10
37
43
for n in range(len(lines)):
38
44
yield hash(tuple(lines[n:n+2])) % modulus
69
75
"""Count the number of hash hits for each tag, for the given lines.
71
77
Hits are weighted according to the number of tags the hash is
72
associated with; more tags means that the lines are not unique and
73
should tend to be ignored.
78
associated with; more tags means that the hash is less rare and should
74
80
:param lines: The lines to calculate hashes of.
81
:return: a dict of {tag: hitcount}
77
84
for my_hash in self.iter_edge_hashes(lines):
91
98
:return: A list of tuples of count, path, file_id.
94
101
task = ui_factory.nested_progress_bar()
96
103
for num, path in enumerate(paths):
100
107
hits = self.hitcounts(my_file.readlines())
103
ordered_hits.extend((v, path, k) for k, v in hits.items())
110
all_hits.extend((v, path, k) for k, v in hits.items())
108
115
def file_match(self, tree, paths):
109
116
"""Return a mapping from file_ids to the supplied paths."""
110
ordered_hits = self.get_all_hits(tree, paths)
111
ordered_hits.sort(reverse=True)
112
return self._match_hits(ordered_hits)
117
return self._match_hits(self.get_all_hits(tree, paths))
115
def _match_hits(ordered_hits):
120
def _match_hits(hit_list):
121
"""Using a hit list, determin a path-to-fileid map.
123
The hit list is a list of (count, path, file_id), where count is a
124
(possibly float) number, with higher numbers indicating stronger
116
127
seen_file_ids = set()
117
128
seen_paths = set()
119
for count, path, file_id in ordered_hits:
130
for count, path, file_id in sorted(hit_list, reverse=True):
120
131
if path in seen_paths or file_id in seen_file_ids:
122
133
path_map[path] = file_id
144
160
return require_ids
146
162
def match_parents(self, required_parents, missing_parents):
163
"""Map parent directories to file-ids.
165
This is done by finding similarity between the file-ids of children of
166
required parent directories and the file-ids of children of missing
148
170
for file_id, file_id_children in missing_parents.iteritems():
149
171
for path, path_children in required_parents.iteritems():
150
172
hits = len(path_children.intersection(file_id_children))
152
ordered_hits.append((hits, path, file_id))
153
ordered_hits.sort(reverse=True)
154
return self._match_hits(ordered_hits)
174
all_hits.append((hits, path, file_id))
175
return self._match_hits(all_hits)
157
178
def guess_renames(tree):