# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
from __future__ import absolute_import

import difflib
from bisect import bisect

from bzrlib.trace import mutter
26
# Public names exported by this module.  NOTE(review): the names listed here
# lack the ``_py`` suffix used by the implementations below — presumably the
# public aliases are bound elsewhere in the file; confirm before relying on it.
__all__ = ['PatienceSequenceMatcher', 'unified_diff', 'unified_diff_files']
29
def unique_lcs_py(a, b):
    """Find the longest common subset for unique lines.

    :param a: An indexable object (such as string or list of strings)
    :param b: Another indexable object (such as string or list of strings)
    :return: A list of tuples, one for each line which is matched.
            [(line_in_a, line_in_b), ...]

    This only matches lines which are unique on both sides.
    This helps prevent common lines from over influencing match
    results.
    The longest common subset uses the Patience Sorting algorithm:
    http://en.wikipedia.org/wiki/Patience_sorting
    """
    # set index[line in a] = position of line in a unless
    # a is a duplicate, in which case it's set to None
    index = {}
    for i in range(len(a)):
        line = a[i]
        if line in index:
            index[line] = None
        else:
            index[line] = i
    # make btoa[i] = position of line i in a, unless
    # that line doesn't occur exactly once in both,
    # in which case it's set to None
    btoa = [None] * len(b)
    index2 = {}
    for pos, line in enumerate(b):
        next = index.get(line)
        if next is not None:
            if line in index2:
                # unset the previous mapping, which we now know to
                # be invalid because the line isn't unique
                btoa[index2[line]] = None
                del index[line]
            else:
                index2[line] = pos
                btoa[pos] = next
    # this is the Patience sorting algorithm
    # see http://en.wikipedia.org/wiki/Patience_sorting
    backpointers = [None] * len(b)
    stacks = []
    lasts = []
    k = 0
    for bpos, apos in enumerate(btoa):
        if apos is None:
            continue
        # as an optimization, check if the next line comes at the end,
        # because it usually does
        if stacks and stacks[-1] < apos:
            k = len(stacks)
        # as an optimization, check if the next line comes right after
        # the previous line, because usually it does
        elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
                                              stacks[k+1] > apos):
            k += 1
        else:
            k = bisect(stacks, apos)
        if k > 0:
            backpointers[bpos] = lasts[k-1]
        if k < len(stacks):
            stacks[k] = apos
            lasts[k] = bpos
        else:
            stacks.append(apos)
            lasts.append(bpos)
    if len(lasts) == 0:
        return []
    result = []
    # walk the backpointer chain from the top of the last pile to
    # recover the longest increasing subsequence, then reverse it
    k = lasts[-1]
    while k is not None:
        result.append((btoa[k], k))
        k = backpointers[k]
    result.reverse()
    return result
107
def recurse_matches_py(a, b, alo, blo, ahi, bhi, answer, maxrecursion):
    """Find all of the matching text in the lines of a and b.

    :param a: A sequence
    :param b: Another sequence
    :param alo: The start location of a to check, typically 0
    :param blo: The start location of b to check, typically 0
    :param ahi: The maximum length of a to check, typically len(a)
    :param bhi: The maximum length of b to check, typically len(b)
    :param answer: The return array. Will be filled with tuples
                   indicating [(line_in_a, line_in_b)]
    :param maxrecursion: The maximum depth to recurse.
                         Must be a positive integer.
    :return: None, the return value is in the parameter answer, which
             should be a list
    """
    if maxrecursion < 0:
        mutter('max recursion depth reached')
        # this will never happen normally, this check is to prevent DOS attacks
        return
    oldlength = len(answer)
    if alo == ahi or blo == bhi:
        return
    last_a_pos = alo - 1
    last_b_pos = blo - 1
    for apos, bpos in unique_lcs_py(a[alo:ahi], b[blo:bhi]):
        # recurse between lines which are unique in each file and match
        apos += alo
        bpos += blo
        # Most of the time, you will have a sequence of similar entries
        if last_a_pos+1 != apos or last_b_pos+1 != bpos:
            recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                apos, bpos, answer, maxrecursion - 1)
        last_a_pos = apos
        last_b_pos = bpos
        answer.append((apos, bpos))
    if len(answer) > oldlength:
        # find matches between the last match and the end
        recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                           ahi, bhi, answer, maxrecursion - 1)
    elif a[alo] == b[blo]:
        # find matching lines at the very beginning
        while alo < ahi and blo < bhi and a[alo] == b[blo]:
            answer.append((alo, blo))
            alo += 1
            blo += 1
        recurse_matches_py(a, b, alo, blo,
                           ahi, bhi, answer, maxrecursion - 1)
    elif a[ahi - 1] == b[bhi - 1]:
        # find matching lines at the very end
        nahi = ahi - 1
        nbhi = bhi - 1
        while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
            nahi -= 1
            nbhi -= 1
        recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                           nahi, nbhi, answer, maxrecursion - 1)
        for i in range(ahi - nahi):
            answer.append((nahi + i, nbhi + i))
169
def _collapse_sequences(matches):
170
"""Find sequences of lines.
172
Given a sequence of [(line_in_a, line_in_b),]
173
find regions where they both increment at the same time
176
start_a = start_b = None
178
for i_a, i_b in matches:
179
if (start_a is not None
180
and (i_a == start_a + length)
181
and (i_b == start_b + length)):
184
if start_a is not None:
185
answer.append((start_a, start_b, length))
191
answer.append((start_a, start_b, length))
196
def _check_consistency(answer):
197
# For consistency sake, make sure all matches are only increasing
200
for (a, b, match_len) in answer:
202
raise ValueError('Non increasing matches for a')
204
raise ValueError('Non increasing matches for b')
205
next_a = a + match_len
206
next_b = b + match_len
209
class PatienceSequenceMatcher_py(difflib.SequenceMatcher):
    """Compare a pair of sequences using longest common subset."""

    # Class-level switch: when True, every computed block list is validated
    # with _check_consistency before being returned.
    _do_check_consistency = True

    def __init__(self, isjunk=None, a='', b=''):
        if isjunk is not None:
            raise NotImplementedError('Currently we do not support'
                                      ' isjunk for sequence matching')
        difflib.SequenceMatcher.__init__(self, isjunk, a, b)

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """
        # jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
        # implementation which uses __helper. 2.4.3 got rid of helper for
        # doing it inline with a queue.
        # We should consider doing the same for recurse_matches

        # difflib.SequenceMatcher caches the result; reuse it when present
        if self.matching_blocks is not None:
            return self.matching_blocks

        matches = []
        recurse_matches_py(self.a, self.b, 0, 0,
                           len(self.a), len(self.b), matches, 10)
        # Matches now has individual line pairs of
        # line A matches line B, at the given offsets
        self.matching_blocks = _collapse_sequences(matches)
        self.matching_blocks.append( (len(self.a), len(self.b), 0) )
        if PatienceSequenceMatcher_py._do_check_consistency:
            if __debug__:
                _check_consistency(self.matching_blocks)

        return self.matching_blocks