# Copyright (C) 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1
18
from cStringIO import StringIO
4
21
class RenameMap(object):
22
"""Determine a mapping of renames."""
7
25
self.edge_hashes = {}
10
28
@staticmethod
def iter_edge_hashes(lines):
    """Iterate through the hashes of line pairs (which make up an edge).

    Every position ``n`` in ``lines`` produces one hash, computed over the
    line at ``n`` together with the line that follows it.  The last line
    has no successor, so it is hashed on its own.

    Declared static because it takes no ``self``; it is invoked as
    ``self.iter_edge_hashes(lines)``, which only works for a static method.

    :param lines: A sequence of hashable items (normally text lines).
    :return: A generator yielding one integer in ``range(modulus)`` per
        item of ``lines``.
    """
    # Reducing each hash modulo a fixed value bounds the number of
    # distinct keys that can be produced.
    modulus = 1024 * 1024 * 10
    for n in range(len(lines)):
        yield hash(tuple(lines[n:n+2])) % modulus
15
34
def add_edge_hashes(self, lines, tag):
    """Update edge_hashes to include the given lines.

    Each edge hash produced for ``lines`` is recorded in
    ``self.edge_hashes`` as a key whose value is the set of tags seen
    with that hash.

    :param lines: The lines to update the hashes for.
    :param tag: A tag uniquely associated with these lines (i.e. file-id)
    """
    for my_hash in self.iter_edge_hashes(lines):
        # More than one tag can share a hash, so the value is a set that
        # is created on first use.
        self.edge_hashes.setdefault(my_hash, set()).add(tag)
19
43
def add_file_edge_hashes(self, tree, file_ids):
    """Update to reflect the hashes for files in the tree.

    :param tree: The tree containing the files.
    :param file_ids: A list of file_ids to perform the updates for.
    """
    # iter_files_bytes is handed (file_id, file_id) pairs; the loop below
    # treats the first item of every result as the file_id.
    desired_files = [(f, f) for f in file_ids]
    for file_id, contents in tree.iter_files_bytes(desired_files):
        # Collect the content into a buffer so it can be re-read as a
        # list of lines.
        # NOTE(review): assumes ``contents`` is a string or an iterable
        # of string chunks -- confirm against tree.iter_files_bytes.
        s = StringIO()
        s.writelines(contents)
        s.seek(0)
        self.add_edge_hashes(s.readlines(), file_id)
27
56
def hitcounts(self, lines):
57
"""Count the number of hash hits for each tag, for the given lines.
59
Hits are weighted according to the number of tags the hash is
60
associated with; more tags means that the lines are not unique and
61
should tend to be ignored.
62
:param lines: The lines to calculate hashes of.
29
65
for my_hash in self.iter_edge_hashes(lines):
30
66
tags = self.edge_hashes.get(my_hash)
40
76
def get_all_hits(self, tree, paths):
77
"""Find all the hit counts for the listed paths in a tree.
79
:return: A list of tuples of count, path, file_id.
43
83
my_file = tree.get_file(None, path=path)