2
# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation; either version 2 of the License, or
7
# (at your option) any later version.
9
# This program is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
14
# You should have received a copy of the GNU General Public License
15
# along with this program; if not, write to the Free Software
16
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
from bisect import bisect
22
from bzrlib.trace import mutter
25
__all__ = ['PatienceSequenceMatcher', 'unified_diff', 'unified_diff_files']
28
def unique_lcs_py(a, b):
29
"""Find the longest common subset for unique lines.
31
:param a: An indexable object (such as string or list of strings)
32
:param b: Another indexable object (such as string or list of strings)
33
:return: A list of tuples, one for each line which is matched.
34
[(line_in_a, line_in_b), ...]
36
This only matches lines which are unique on both sides.
37
This helps prevent common lines from over influencing match
39
The longest common subset uses the Patience Sorting algorithm:
40
http://en.wikipedia.org/wiki/Patience_sorting
42
# set index[line in a] = position of line in a unless
43
# a is a duplicate, in which case it's set to None
45
for i in xrange(len(a)):
51
# make btoa[i] = position of line i in a, unless
52
# that line doesn't occur exactly once in both,
53
# in which case it's set to None
54
btoa = [None] * len(b)
56
for pos, line in enumerate(b):
57
next = index.get(line)
60
# unset the previous mapping, which we now know to
61
# be invalid because the line isn't unique
62
btoa[index2[line]] = None
67
# this is the Patience sorting algorithm
68
# see http://en.wikipedia.org/wiki/Patience_sorting
69
backpointers = [None] * len(b)
73
for bpos, apos in enumerate(btoa):
76
# as an optimization, check if the next line comes at the end,
77
# because it usually does
78
if stacks and stacks[-1] < apos:
80
# as an optimization, check if the next line comes right after
81
# the previous line, because usually it does
82
elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
86
k = bisect(stacks, apos)
88
backpointers[bpos] = lasts[k-1]
100
result.append((btoa[k], k))
106
def recurse_matches_py(a, b, alo, blo, ahi, bhi, answer, maxrecursion):
107
"""Find all of the matching text in the lines of a and b.
110
:param b: Another sequence
111
:param alo: The start location of a to check, typically 0
112
:param ahi: The start location of b to check, typically 0
113
:param ahi: The maximum length of a to check, typically len(a)
114
:param bhi: The maximum length of b to check, typically len(b)
115
:param answer: The return array. Will be filled with tuples
116
indicating [(line_in_a, line_in_b)]
117
:param maxrecursion: The maximum depth to recurse.
118
Must be a positive integer.
119
:return: None, the return value is in the parameter answer, which
124
mutter('max recursion depth reached')
125
# this will never happen normally, this check is to prevent DOS attacks
127
oldlength = len(answer)
128
if alo == ahi or blo == bhi:
132
for apos, bpos in unique_lcs_py(a[alo:ahi], b[blo:bhi]):
133
# recurse between lines which are unique in each file and match
136
# Most of the time, you will have a sequence of similar entries
137
if last_a_pos+1 != apos or last_b_pos+1 != bpos:
138
recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
139
apos, bpos, answer, maxrecursion - 1)
142
answer.append((apos, bpos))
143
if len(answer) > oldlength:
144
# find matches between the last match and the end
145
recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
146
ahi, bhi, answer, maxrecursion - 1)
147
elif a[alo] == b[blo]:
148
# find matching lines at the very beginning
149
while alo < ahi and blo < bhi and a[alo] == b[blo]:
150
answer.append((alo, blo))
153
recurse_matches_py(a, b, alo, blo,
154
ahi, bhi, answer, maxrecursion - 1)
155
elif a[ahi - 1] == b[bhi - 1]:
156
# find matching lines at the very end
159
while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
162
recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
163
nahi, nbhi, answer, maxrecursion - 1)
164
for i in xrange(ahi - nahi):
165
answer.append((nahi + i, nbhi + i))
168
def _collapse_sequences(matches):
169
"""Find sequences of lines.
171
Given a sequence of [(line_in_a, line_in_b),]
172
find regions where they both increment at the same time
175
start_a = start_b = None
177
for i_a, i_b in matches:
178
if (start_a is not None
179
and (i_a == start_a + length)
180
and (i_b == start_b + length)):
183
if start_a is not None:
184
answer.append((start_a, start_b, length))
190
answer.append((start_a, start_b, length))
195
def _check_consistency(answer):
196
# For consistency sake, make sure all matches are only increasing
199
for (a, b, match_len) in answer:
201
raise ValueError('Non increasing matches for a')
203
raise ValueError('Non increasing matches for b')
204
next_a = a + match_len
205
next_b = b + match_len
208
class PatienceSequenceMatcher_py(difflib.SequenceMatcher):
209
"""Compare a pair of sequences using longest common subset."""
211
_do_check_consistency = True
213
def __init__(self, isjunk=None, a='', b=''):
214
if isjunk is not None:
215
raise NotImplementedError('Currently we do not support'
216
' isjunk for sequence matching')
217
difflib.SequenceMatcher.__init__(self, isjunk, a, b)
219
def get_matching_blocks(self):
220
"""Return list of triples describing matching subsequences.
222
Each triple is of the form (i, j, n), and means that
223
a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
226
The last triple is a dummy, (len(a), len(b), 0), and is the only
229
>>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
230
>>> s.get_matching_blocks()
231
[(0, 0, 2), (3, 2, 2), (5, 4, 0)]
233
# jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
234
# implementation which uses __helper. 2.4.3 got rid of helper for
235
# doing it inline with a queue.
236
# We should consider doing the same for recurse_matches
238
if self.matching_blocks is not None:
239
return self.matching_blocks
242
recurse_matches_py(self.a, self.b, 0, 0,
243
len(self.a), len(self.b), matches, 10)
244
# Matches now has individual line pairs of
245
# line A matches line B, at the given offsets
246
self.matching_blocks = _collapse_sequences(matches)
247
self.matching_blocks.append( (len(self.a), len(self.b), 0) )
248
if PatienceSequenceMatcher_py._do_check_consistency:
250
_check_consistency(self.matching_blocks)
252
return self.matching_blocks