1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
1 |
#!/usr/bin/env python
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
2 |
# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
|
3 |
#
|
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
4 |
# This program is free software; you can redistribute it and/or modify
|
5 |
# it under the terms of the GNU General Public License as published by
|
|
6 |
# the Free Software Foundation; either version 2 of the License, or
|
|
7 |
# (at your option) any later version.
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
8 |
#
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
9 |
# This program is distributed in the hope that it will be useful,
|
10 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
11 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
12 |
# GNU General Public License for more details.
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
13 |
#
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
14 |
# You should have received a copy of the GNU General Public License
|
15 |
# along with this program; if not, write to the Free Software
|
|
4183.7.1
by Sabin Iacob
update FSF mailing address |
16 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
17 |
|
6379.6.3
by Jelmer Vernooij
Use absolute_import. |
18 |
from __future__ import absolute_import |
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
19 |
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
20 |
from bisect import bisect |
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
21 |
import difflib |
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
22 |
|
1711.2.12
by John Arbash Meinel
Make a mention when the maximum recursion length is reached. |
23 |
from bzrlib.trace import mutter |
24 |
||
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
25 |
|
1711.2.11
by John Arbash Meinel
Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher |
26 |
__all__ = ['PatienceSequenceMatcher', 'unified_diff', 'unified_diff_files'] |
1185.81.9
by John Arbash Meinel
Added (failing) tests for cdv.recurse_matches with common sections, |
27 |
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
28 |
|
2781.1.1
by Martin Pool
merge cpatiencediff from Lukas |
29 |
def unique_lcs_py(a, b):
    """Find the longest common subset for unique lines.

    :param a: An indexable object (such as string or list of strings)
    :param b: Another indexable object (such as string or list of strings)
    :return: A list of tuples, one for each line which is matched.
            [(line_in_a, line_in_b), ...]

    This only matches lines which are unique on both sides.
    This helps prevent common lines from over influencing match
    results.
    The longest common subset uses the Patience Sorting algorithm:
    http://en.wikipedia.org/wiki/Patience_sorting
    """
    # set index[line in a] = position of line in a unless
    # a is a duplicate, in which case it's set to None
    index = {}
    for i, line in enumerate(a):
        if line in index:
            index[line] = None
        else:
            index[line] = i
    # make btoa[i] = position of line i in a, unless
    # that line doesn't occur exactly once in both,
    # in which case it's set to None
    btoa = [None] * len(b)
    index2 = {}
    for pos, line in enumerate(b):
        # next_pos rather than 'next': don't shadow the builtin
        next_pos = index.get(line)
        if next_pos is not None:
            if line in index2:
                # unset the previous mapping, which we now know to
                # be invalid because the line isn't unique
                btoa[index2[line]] = None
                del index[line]
            else:
                index2[line] = pos
                btoa[pos] = next_pos
    # this is the Patience sorting algorithm
    # see http://en.wikipedia.org/wiki/Patience_sorting
    backpointers = [None] * len(b)
    stacks = []
    lasts = []
    k = 0
    for bpos, apos in enumerate(btoa):
        if apos is None:
            continue
        # as an optimization, check if the next line comes at the end,
        # because it usually does
        if stacks and stacks[-1] < apos:
            k = len(stacks)
        # as an optimization, check if the next line comes right after
        # the previous line, because usually it does
        elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
                                              stacks[k+1] > apos):
            k += 1
        else:
            k = bisect(stacks, apos)
        if k > 0:
            backpointers[bpos] = lasts[k-1]
        if k < len(stacks):
            stacks[k] = apos
            lasts[k] = bpos
        else:
            stacks.append(apos)
            lasts.append(bpos)
    if len(lasts) == 0:
        return []
    result = []
    k = lasts[-1]
    while k is not None:
        result.append((btoa[k], k))
        k = backpointers[k]
    result.reverse()
    return result
|
105 |
||
106 |
||
2781.1.1
by Martin Pool
merge cpatiencediff from Lukas |
107 |
def recurse_matches_py(a, b, alo, blo, ahi, bhi, answer, maxrecursion):
    """Find all of the matching text in the lines of a and b.

    :param a: A sequence
    :param b: Another sequence
    :param alo: The start location of a to check, typically 0
    :param blo: The start location of b to check, typically 0
    :param ahi: The maximum length of a to check, typically len(a)
    :param bhi: The maximum length of b to check, typically len(b)
    :param answer: The return array. Will be filled with tuples
                   indicating [(line_in_a, line_in_b)]
    :param maxrecursion: The maximum depth to recurse.
                         Must be a positive integer.
    :return: None, the return value is in the parameter answer, which
             should be a list
    """
    if maxrecursion < 0:
        mutter('max recursion depth reached')
        # this will never happen normally, this check is to prevent DOS attacks
        return
    oldlength = len(answer)
    if alo == ahi or blo == bhi:
        return
    last_a_pos = alo-1
    last_b_pos = blo-1
    for apos, bpos in unique_lcs_py(a[alo:ahi], b[blo:bhi]):
        # recurse between lines which are unique in each file and match
        apos += alo
        bpos += blo
        # Most of the time, you will have a sequence of similar entries
        if last_a_pos+1 != apos or last_b_pos+1 != bpos:
            recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                               apos, bpos, answer, maxrecursion - 1)
        last_a_pos = apos
        last_b_pos = bpos
        answer.append((apos, bpos))
    if len(answer) > oldlength:
        # find matches between the last match and the end
        recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                           ahi, bhi, answer, maxrecursion - 1)
    elif a[alo] == b[blo]:
        # find matching lines at the very beginning
        while alo < ahi and blo < bhi and a[alo] == b[blo]:
            answer.append((alo, blo))
            alo += 1
            blo += 1
        recurse_matches_py(a, b, alo, blo,
                           ahi, bhi, answer, maxrecursion - 1)
    elif a[ahi - 1] == b[bhi - 1]:
        # find matching lines at the very end
        nahi = ahi - 1
        nbhi = bhi - 1
        while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
            nahi -= 1
            nbhi -= 1
        recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                           nahi, nbhi, answer, maxrecursion - 1)
        for i in range(ahi - nahi):
            answer.append((nahi + i, nbhi + i))
|
167 |
||
168 |
||
1711.2.21
by John Arbash Meinel
Cleanup patiencediff, remove the use of difflib.SequenceMatcher. |
169 |
def _collapse_sequences(matches): |
170 |
"""Find sequences of lines.
|
|
171 |
||
172 |
Given a sequence of [(line_in_a, line_in_b),]
|
|
173 |
find regions where they both increment at the same time
|
|
174 |
"""
|
|
175 |
answer = [] |
|
176 |
start_a = start_b = None |
|
177 |
length = 0 |
|
178 |
for i_a, i_b in matches: |
|
179 |
if (start_a is not None |
|
3565.3.1
by Robert Collins
* The generic fetch code now uses two attributes on Repository objects |
180 |
and (i_a == start_a + length) |
1711.2.21
by John Arbash Meinel
Cleanup patiencediff, remove the use of difflib.SequenceMatcher. |
181 |
and (i_b == start_b + length)): |
182 |
length += 1 |
|
183 |
else: |
|
184 |
if start_a is not None: |
|
185 |
answer.append((start_a, start_b, length)) |
|
186 |
start_a = i_a |
|
187 |
start_b = i_b |
|
188 |
length = 1 |
|
189 |
||
190 |
if length != 0: |
|
191 |
answer.append((start_a, start_b, length)) |
|
192 |
||
193 |
return answer |
|
194 |
||
195 |
||
196 |
def _check_consistency(answer): |
|
197 |
# For consistency sake, make sure all matches are only increasing
|
|
198 |
next_a = -1 |
|
199 |
next_b = -1 |
|
3376.2.8
by Martin Pool
Some review cleanups for assertion removal |
200 |
for (a, b, match_len) in answer: |
3376.2.4
by Martin Pool
Remove every assert statement from bzrlib! |
201 |
if a < next_a: |
202 |
raise ValueError('Non increasing matches for a') |
|
203 |
if b < next_b: |
|
3376.2.8
by Martin Pool
Some review cleanups for assertion removal |
204 |
raise ValueError('Non increasing matches for b') |
1711.2.21
by John Arbash Meinel
Cleanup patiencediff, remove the use of difflib.SequenceMatcher. |
205 |
next_a = a + match_len |
206 |
next_b = b + match_len |
|
207 |
||
208 |
||
2781.1.1
by Martin Pool
merge cpatiencediff from Lukas |
209 |
class PatienceSequenceMatcher_py(difflib.SequenceMatcher):
    """Compare a pair of sequences using longest common subset."""

    # When True (and running without -O), every computed block list is
    # run through _check_consistency as a sanity check.
    _do_check_consistency = True

    def __init__(self, isjunk=None, a='', b=''):
        # isjunk is accepted only for signature compatibility with
        # difflib.SequenceMatcher; it is not implemented here.
        if isjunk is not None:
            raise NotImplementedError(
                'Currently we do not support isjunk for sequence matching')
        difflib.SequenceMatcher.__init__(self, isjunk, a, b)

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """
        # jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
        # implementation which uses __helper. 2.4.3 got rid of helper for
        # doing it inline with a queue.
        # We should consider doing the same for recurse_matches

        # Serve the cached answer when we already computed it.
        if self.matching_blocks is not None:
            return self.matching_blocks

        # First gather individual (line_in_a, line_in_b) pairs ...
        pairwise_matches = []
        recurse_matches_py(self.a, self.b, 0, 0,
                           len(self.a), len(self.b), pairwise_matches, 10)
        # ... then fold adjacent pairs into (i, j, n) runs and append
        # the terminating dummy triple.
        blocks = _collapse_sequences(pairwise_matches)
        blocks.append((len(self.a), len(self.b), 0))
        self.matching_blocks = blocks
        if __debug__ and PatienceSequenceMatcher_py._do_check_consistency:
            _check_consistency(self.matching_blocks)

        return self.matching_blocks