1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
1 |
#!/usr/bin/env python
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
2 |
# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
|
3 |
#
|
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
4 |
# This program is free software; you can redistribute it and/or modify
|
5 |
# it under the terms of the GNU General Public License as published by
|
|
6 |
# the Free Software Foundation; either version 2 of the License, or
|
|
7 |
# (at your option) any later version.
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
8 |
#
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
9 |
# This program is distributed in the hope that it will be useful,
|
10 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
11 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
12 |
# GNU General Public License for more details.
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
13 |
#
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
14 |
# You should have received a copy of the GNU General Public License
|
15 |
# along with this program; if not, write to the Free Software
|
|
16 |
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
17 |
||
18 |
||
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
19 |
from bisect import bisect |
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
20 |
import difflib |
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
21 |
|
1711.2.12
by John Arbash Meinel
Make a mention when the maximum recursion length is reached. |
22 |
from bzrlib.trace import mutter |
23 |
||
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
24 |
|
1711.2.11
by John Arbash Meinel
Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher |
25 |
__all__ = ['PatienceSequenceMatcher', 'unified_diff', 'unified_diff_files'] |
1185.81.9
by John Arbash Meinel
Added (failing) tests for cdv.recurse_matches with common sections, |
26 |
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
27 |
|
2781.1.1
by Martin Pool
merge cpatiencediff from Lukas |
28 |
def unique_lcs_py(a, b):
    """Find the longest common subset for unique lines.

    :param a: An indexable object (such as string or list of strings)
    :param b: Another indexable object (such as string or list of strings)
    :return: A list of tuples, one for each line which is matched.
            [(line_in_a, line_in_b), ...]

    This only matches lines which are unique on both sides.
    This helps prevent common lines from over influencing match
    results.
    The longest common subset uses the Patience Sorting algorithm:
    http://en.wikipedia.org/wiki/Patience_sorting
    """
    # set index[line in a] = position of line in a unless
    # a is a duplicate, in which case it's set to None
    index = {}
    for i, line in enumerate(a):
        if line in index:
            index[line] = None
        else:
            index[line] = i
    # make btoa[i] = position of line i in a, unless
    # that line doesn't occur exactly once in both,
    # in which case it's set to None
    btoa = [None] * len(b)
    index2 = {}
    for pos, line in enumerate(b):
        a_index = index.get(line)
        if a_index is not None:
            if line in index2:
                # unset the previous mapping, which we now know to
                # be invalid because the line isn't unique
                btoa[index2[line]] = None
                del index[line]
            else:
                index2[line] = pos
                btoa[pos] = a_index
    # this is the Patience sorting algorithm
    # see http://en.wikipedia.org/wiki/Patience_sorting
    backpointers = [None] * len(b)
    stacks = []
    lasts = []
    k = 0
    for bpos, apos in enumerate(btoa):
        if apos is None:
            continue
        # as an optimization, check if the next line comes at the end,
        # because it usually does
        if stacks and stacks[-1] < apos:
            k = len(stacks)
        # as an optimization, check if the next line comes right after
        # the previous line, because usually it does
        elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
                                              stacks[k+1] > apos):
            k += 1
        else:
            k = bisect(stacks, apos)
        if k > 0:
            backpointers[bpos] = lasts[k-1]
        if k < len(stacks):
            stacks[k] = apos
            lasts[k] = bpos
        else:
            stacks.append(apos)
            lasts.append(bpos)
    if len(lasts) == 0:
        return []
    result = []
    k = lasts[-1]
    while k is not None:
        result.append((btoa[k], k))
        k = backpointers[k]
    result.reverse()
    return result
2781.1.1
by Martin Pool
merge cpatiencediff from Lukas |
106 |
def recurse_matches_py(a, b, alo, blo, ahi, bhi, answer, maxrecursion):
    """Find all of the matching text in the lines of a and b.

    :param a: A sequence
    :param b: Another sequence
    :param alo: The start location of a to check, typically 0
    :param blo: The start location of b to check, typically 0
    :param ahi: The maximum length of a to check, typically len(a)
    :param bhi: The maximum length of b to check, typically len(b)
    :param answer: The return array. Will be filled with tuples
                   indicating [(line_in_a, line_in_b)]
    :param maxrecursion: The maximum depth to recurse.
                         Must be a positive integer.
    :return: None, the return value is in the parameter answer, which
             should be a list
    """
    if maxrecursion < 0:
        mutter('max recursion depth reached')
        # this will never happen normally, this check is to prevent DOS attacks
        return
    oldlength = len(answer)
    if alo == ahi or blo == bhi:
        return
    last_a_pos = alo-1
    last_b_pos = blo-1
    for apos, bpos in unique_lcs_py(a[alo:ahi], b[blo:bhi]):
        # recurse between lines which are unique in each file and match
        # (unique_lcs_py returned indexes relative to the slices, so
        # translate them back to absolute positions)
        apos += alo
        bpos += blo
        # Most of the time, you will have a sequence of similar entries
        if last_a_pos+1 != apos or last_b_pos+1 != bpos:
            recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                               apos, bpos, answer, maxrecursion - 1)
        last_a_pos = apos
        last_b_pos = bpos
        answer.append((apos, bpos))
    if len(answer) > oldlength:
        # find matches between the last match and the end
        recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                           ahi, bhi, answer, maxrecursion - 1)
    elif a[alo] == b[blo]:
        # find matching lines at the very beginning
        while alo < ahi and blo < bhi and a[alo] == b[blo]:
            answer.append((alo, blo))
            alo += 1
            blo += 1
        recurse_matches_py(a, b, alo, blo,
                           ahi, bhi, answer, maxrecursion - 1)
    elif a[ahi - 1] == b[bhi - 1]:
        # find matching lines at the very end
        nahi = ahi - 1
        nbhi = bhi - 1
        while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
            nahi -= 1
            nbhi -= 1
        recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                           nahi, nbhi, answer, maxrecursion - 1)
        # emit the common tail directly; it was excluded from the recursion
        for i in range(ahi - nahi):
            answer.append((nahi + i, nbhi + i))
1711.2.21
by John Arbash Meinel
Cleanup patiencediff, remove the use of difflib.SequenceMatcher. |
168 |
def _collapse_sequences(matches): |
169 |
"""Find sequences of lines.
|
|
170 |
||
171 |
Given a sequence of [(line_in_a, line_in_b),]
|
|
172 |
find regions where they both increment at the same time
|
|
173 |
"""
|
|
174 |
answer = [] |
|
175 |
start_a = start_b = None |
|
176 |
length = 0 |
|
177 |
for i_a, i_b in matches: |
|
178 |
if (start_a is not None |
|
179 |
and (i_a == start_a + length) |
|
180 |
and (i_b == start_b + length)): |
|
181 |
length += 1 |
|
182 |
else: |
|
183 |
if start_a is not None: |
|
184 |
answer.append((start_a, start_b, length)) |
|
185 |
start_a = i_a |
|
186 |
start_b = i_b |
|
187 |
length = 1 |
|
188 |
||
189 |
if length != 0: |
|
190 |
answer.append((start_a, start_b, length)) |
|
191 |
||
192 |
return answer |
|
193 |
||
194 |
||
195 |
def _check_consistency(answer): |
|
196 |
# For consistency sake, make sure all matches are only increasing
|
|
197 |
next_a = -1 |
|
198 |
next_b = -1 |
|
3376.2.8
by Martin Pool
Some review cleanups for assertion removal |
199 |
for (a, b, match_len) in answer: |
3376.2.4
by Martin Pool
Remove every assert statement from bzrlib! |
200 |
if a < next_a: |
201 |
raise ValueError('Non increasing matches for a') |
|
202 |
if b < next_b: |
|
3376.2.8
by Martin Pool
Some review cleanups for assertion removal |
203 |
raise ValueError('Non increasing matches for b') |
1711.2.21
by John Arbash Meinel
Cleanup patiencediff, remove the use of difflib.SequenceMatcher. |
204 |
next_a = a + match_len |
205 |
next_b = b + match_len |
|
206 |
||
207 |
||
2781.1.1
by Martin Pool
merge cpatiencediff from Lukas |
208 |
class PatienceSequenceMatcher_py(difflib.SequenceMatcher):
    """Compare a pair of sequences using longest common subset."""

    # When True (and python is not running with -O), every computed list of
    # matching blocks is validated with _check_consistency().
    _do_check_consistency = True

    def __init__(self, isjunk=None, a='', b=''):
        if isjunk is not None:
            raise NotImplementedError('Currently we do not support'
                                      ' isjunk for sequence matching')
        difflib.SequenceMatcher.__init__(self, isjunk, a, b)

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """
        # jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
        # implementation which uses __helper. 2.4.3 got rid of helper for
        # doing it inline with a queue.
        # We should consider doing the same for recurse_matches

        # the result is cached on the instance after the first computation
        cached = self.matching_blocks
        if cached is not None:
            return cached

        # collect individual (line_in_a, line_in_b) pairs ...
        pairs = []
        recurse_matches_py(self.a, self.b, 0, 0,
                           len(self.a), len(self.b), pairs, 10)
        # ... then collapse them into contiguous (i, j, n) runs and add
        # the terminating dummy triple
        blocks = _collapse_sequences(pairs)
        blocks.append((len(self.a), len(self.b), 0))
        self.matching_blocks = blocks
        if PatienceSequenceMatcher_py._do_check_consistency and __debug__:
            _check_consistency(blocks)

        return blocks