1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
1 |
#!/usr/bin/env python
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
2 |
# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
|
3 |
#
|
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
4 |
# This program is free software; you can redistribute it and/or modify
|
5 |
# it under the terms of the GNU General Public License as published by
|
|
6 |
# the Free Software Foundation; either version 2 of the License, or
|
|
7 |
# (at your option) any later version.
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
8 |
#
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
9 |
# This program is distributed in the hope that it will be useful,
|
10 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
11 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
12 |
# GNU General Public License for more details.
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
13 |
#
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
14 |
# You should have received a copy of the GNU General Public License
|
15 |
# along with this program; if not, write to the Free Software
|
|
16 |
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
17 |
||
18 |
||
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
19 |
from bisect import bisect |
20 |
from copy import copy |
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
21 |
import difflib |
1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
22 |
import os |
23 |
import sys |
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
24 |
import time |
25 |
||
1711.2.12
by John Arbash Meinel
Make a mention when the maximum recursion length is reached. |
26 |
from bzrlib.trace import mutter |
27 |
||
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
28 |
|
1711.2.11
by John Arbash Meinel
Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher |
29 |
__all__ = ['PatienceSequenceMatcher', 'unified_diff', 'unified_diff_files'] |
1185.81.9
by John Arbash Meinel
Added (failing) tests for cdv.recurse_matches with common sections, |
30 |
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
31 |
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
32 |
def unique_lcs(a, b): |
33 |
"""Find the longest common subset for unique lines.
|
|
34 |
||
35 |
:param a: An indexable object (such as string or list of strings)
|
|
36 |
:param b: Another indexable object (such as string or list of strings)
|
|
37 |
:return: A list of tuples, one for each line which is matched.
|
|
38 |
[(line_in_a, line_in_b), ...]
|
|
39 |
||
40 |
This only matches lines which are unique on both sides.
|
|
41 |
This helps prevent common lines from over influencing match
|
|
42 |
results.
|
|
43 |
The longest common subset uses the Patience Sorting algorithm:
|
|
44 |
http://en.wikipedia.org/wiki/Patience_sorting
|
|
45 |
"""
|
|
46 |
# set index[line in a] = position of line in a unless
|
|
47 |
# unless a is a duplicate, in which case it's set to None
|
|
48 |
index = {} |
|
49 |
for i in xrange(len(a)): |
|
50 |
line = a[i] |
|
51 |
if line in index: |
|
52 |
index[line] = None |
|
53 |
else: |
|
54 |
index[line]= i |
|
55 |
# make btoa[i] = position of line i in a, unless
|
|
56 |
# that line doesn't occur exactly once in both,
|
|
57 |
# in which case it's set to None
|
|
58 |
btoa = [None] * len(b) |
|
59 |
index2 = {} |
|
60 |
for pos, line in enumerate(b): |
|
61 |
next = index.get(line) |
|
62 |
if next is not None: |
|
63 |
if line in index2: |
|
64 |
# unset the previous mapping, which we now know to
|
|
65 |
# be invalid because the line isn't unique
|
|
66 |
btoa[index2[line]] = None |
|
67 |
del index[line] |
|
68 |
else: |
|
69 |
index2[line] = pos |
|
70 |
btoa[pos] = next |
|
71 |
# this is the Patience sorting algorithm
|
|
72 |
# see http://en.wikipedia.org/wiki/Patience_sorting
|
|
73 |
backpointers = [None] * len(b) |
|
74 |
stacks = [] |
|
75 |
lasts = [] |
|
76 |
k = 0 |
|
77 |
for bpos, apos in enumerate(btoa): |
|
78 |
if apos is None: |
|
79 |
continue
|
|
80 |
# as an optimization, check if the next line comes at the end,
|
|
81 |
# because it usually does
|
|
82 |
if stacks and stacks[-1] < apos: |
|
83 |
k = len(stacks) |
|
84 |
# as an optimization, check if the next line comes right after
|
|
85 |
# the previous line, because usually it does
|
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
86 |
elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or |
87 |
stacks[k+1] > apos): |
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
88 |
k += 1 |
89 |
else: |
|
90 |
k = bisect(stacks, apos) |
|
91 |
if k > 0: |
|
92 |
backpointers[bpos] = lasts[k-1] |
|
93 |
if k < len(stacks): |
|
94 |
stacks[k] = apos |
|
95 |
lasts[k] = bpos |
|
96 |
else: |
|
97 |
stacks.append(apos) |
|
98 |
lasts.append(bpos) |
|
99 |
if len(lasts) == 0: |
|
100 |
return [] |
|
101 |
result = [] |
|
102 |
k = lasts[-1] |
|
103 |
while k is not None: |
|
104 |
result.append((btoa[k], k)) |
|
105 |
k = backpointers[k] |
|
106 |
result.reverse() |
|
107 |
return result |
|
108 |
||
109 |
||
1711.2.22
by John Arbash Meinel
Passing the alo parameter to recurse_matches shaves of 5% of the diff time. |
110 |
def recurse_matches(a, b, alo, blo, ahi, bhi, answer, maxrecursion): |
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
111 |
"""Find all of the matching text in the lines of a and b.
|
112 |
||
113 |
:param a: A sequence
|
|
114 |
:param b: Another sequence
|
|
1711.2.22
by John Arbash Meinel
Passing the alo parameter to recurse_matches shaves of 5% of the diff time. |
115 |
:param alo: The start location of a to check, typically 0
|
116 |
:param ahi: The start location of b to check, typically 0
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
117 |
:param ahi: The maximum length of a to check, typically len(a)
|
118 |
:param bhi: The maximum length of b to check, typically len(b)
|
|
119 |
:param answer: The return array. Will be filled with tuples
|
|
1711.2.17
by John Arbash Meinel
Small cleanups to patience_diff code. |
120 |
indicating [(line_in_a, line_in_b)]
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
121 |
:param maxrecursion: The maximum depth to recurse.
|
122 |
Must be a positive integer.
|
|
123 |
:return: None, the return value is in the parameter answer, which
|
|
124 |
should be a list
|
|
125 |
||
126 |
"""
|
|
127 |
if maxrecursion < 0: |
|
1711.2.12
by John Arbash Meinel
Make a mention when the maximum recursion length is reached. |
128 |
mutter('max recursion depth reached') |
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
129 |
# this will never happen normally, this check is to prevent DOS attacks
|
130 |
return
|
|
131 |
oldlength = len(answer) |
|
132 |
if alo == ahi or blo == bhi: |
|
133 |
return
|
|
1711.2.22
by John Arbash Meinel
Passing the alo parameter to recurse_matches shaves of 5% of the diff time. |
134 |
last_a_pos = alo-1 |
135 |
last_b_pos = blo-1 |
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
136 |
for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]): |
137 |
# recurse between lines which are unique in each file and match
|
|
138 |
apos += alo |
|
139 |
bpos += blo |
|
1711.2.18
by John Arbash Meinel
Optimize common case where unique_lcs returns a set of lines all in a row |
140 |
# Most of the time, you will have a sequence of similar entries
|
141 |
if last_a_pos+1 != apos or last_b_pos+1 != bpos: |
|
1711.2.22
by John Arbash Meinel
Passing the alo parameter to recurse_matches shaves of 5% of the diff time. |
142 |
recurse_matches(a, b, last_a_pos+1, last_b_pos+1, |
143 |
apos, bpos, answer, maxrecursion - 1) |
|
1711.2.18
by John Arbash Meinel
Optimize common case where unique_lcs returns a set of lines all in a row |
144 |
last_a_pos = apos |
145 |
last_b_pos = bpos |
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
146 |
answer.append((apos, bpos)) |
147 |
if len(answer) > oldlength: |
|
148 |
# find matches between the last match and the end
|
|
1711.2.22
by John Arbash Meinel
Passing the alo parameter to recurse_matches shaves of 5% of the diff time. |
149 |
recurse_matches(a, b, last_a_pos+1, last_b_pos+1, |
150 |
ahi, bhi, answer, maxrecursion - 1) |
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
151 |
elif a[alo] == b[blo]: |
152 |
# find matching lines at the very beginning
|
|
153 |
while alo < ahi and blo < bhi and a[alo] == b[blo]: |
|
154 |
answer.append((alo, blo)) |
|
155 |
alo += 1 |
|
156 |
blo += 1 |
|
1711.2.22
by John Arbash Meinel
Passing the alo parameter to recurse_matches shaves of 5% of the diff time. |
157 |
recurse_matches(a, b, alo, blo, |
158 |
ahi, bhi, answer, maxrecursion - 1) |
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
159 |
elif a[ahi - 1] == b[bhi - 1]: |
160 |
# find matching lines at the very end
|
|
161 |
nahi = ahi - 1 |
|
162 |
nbhi = bhi - 1 |
|
163 |
while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]: |
|
164 |
nahi -= 1 |
|
165 |
nbhi -= 1 |
|
1711.2.22
by John Arbash Meinel
Passing the alo parameter to recurse_matches shaves of 5% of the diff time. |
166 |
recurse_matches(a, b, last_a_pos+1, last_b_pos+1, |
167 |
nahi, nbhi, answer, maxrecursion - 1) |
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
168 |
for i in xrange(ahi - nahi): |
169 |
answer.append((nahi + i, nbhi + i)) |
|
170 |
||
171 |
||
1711.2.21
by John Arbash Meinel
Cleanup patiencediff, remove the use of difflib.SequenceMatcher. |
172 |
def _collapse_sequences(matches): |
173 |
"""Find sequences of lines.
|
|
174 |
||
175 |
Given a sequence of [(line_in_a, line_in_b),]
|
|
176 |
find regions where they both increment at the same time
|
|
177 |
"""
|
|
178 |
answer = [] |
|
179 |
start_a = start_b = None |
|
180 |
length = 0 |
|
181 |
for i_a, i_b in matches: |
|
182 |
if (start_a is not None |
|
183 |
and (i_a == start_a + length) |
|
184 |
and (i_b == start_b + length)): |
|
185 |
length += 1 |
|
186 |
else: |
|
187 |
if start_a is not None: |
|
188 |
answer.append((start_a, start_b, length)) |
|
189 |
start_a = i_a |
|
190 |
start_b = i_b |
|
191 |
length = 1 |
|
192 |
||
193 |
if length != 0: |
|
194 |
answer.append((start_a, start_b, length)) |
|
195 |
||
196 |
return answer |
|
197 |
||
198 |
||
199 |
def _check_consistency(answer): |
|
200 |
# For consistency sake, make sure all matches are only increasing
|
|
201 |
next_a = -1 |
|
202 |
next_b = -1 |
|
203 |
for a,b,match_len in answer: |
|
204 |
assert a >= next_a, 'Non increasing matches for a' |
|
205 |
assert b >= next_b, 'Not increasing matches for b' |
|
206 |
next_a = a + match_len |
|
207 |
next_b = b + match_len |
|
208 |
||
209 |
||
1711.2.11
by John Arbash Meinel
Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher |
210 |
class PatienceSequenceMatcher(difflib.SequenceMatcher): |
1185.81.5
by John Arbash Meinel
Fix up SequenceMatcher, add comments to nofrillsprecisemerge |
211 |
"""Compare a pair of sequences using longest common subset."""
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
212 |
|
1711.2.21
by John Arbash Meinel
Cleanup patiencediff, remove the use of difflib.SequenceMatcher. |
213 |
_do_check_consistency = True |
214 |
||
1185.81.5
by John Arbash Meinel
Fix up SequenceMatcher, add comments to nofrillsprecisemerge |
215 |
def __init__(self, isjunk=None, a='', b=''): |
216 |
if isjunk is not None: |
|
217 |
raise NotImplementedError('Currently we do not support' |
|
218 |
' isjunk for sequence matching') |
|
219 |
difflib.SequenceMatcher.__init__(self, isjunk, a, b) |
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
220 |
|
1711.2.7
by John Arbash Meinel
Override get_matching_blocks |
221 |
def get_matching_blocks(self): |
222 |
"""Return list of triples describing matching subsequences.
|
|
223 |
||
224 |
Each triple is of the form (i, j, n), and means that
|
|
225 |
a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
|
|
226 |
i and in j.
|
|
227 |
||
228 |
The last triple is a dummy, (len(a), len(b), 0), and is the only
|
|
229 |
triple with n==0.
|
|
230 |
||
1711.2.11
by John Arbash Meinel
Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher |
231 |
>>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
|
1711.2.7
by John Arbash Meinel
Override get_matching_blocks |
232 |
>>> s.get_matching_blocks()
|
233 |
[(0, 0, 2), (3, 2, 2), (5, 4, 0)]
|
|
234 |
"""
|
|
235 |
# jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
|
|
236 |
# implementation which uses __helper. 2.4.3 got rid of helper for
|
|
237 |
# doing it inline with a queue.
|
|
238 |
# We should consider doing the same for recurse_matches
|
|
239 |
||
240 |
if self.matching_blocks is not None: |
|
241 |
return self.matching_blocks |
|
242 |
||
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
243 |
matches = [] |
1711.2.22
by John Arbash Meinel
Passing the alo parameter to recurse_matches shaves of 5% of the diff time. |
244 |
recurse_matches(self.a, self.b, 0, 0, |
245 |
len(self.a), len(self.b), matches, 10) |
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
246 |
# Matches now has individual line pairs of
|
247 |
# line A matches line B, at the given offsets
|
|
1711.2.21
by John Arbash Meinel
Cleanup patiencediff, remove the use of difflib.SequenceMatcher. |
248 |
self.matching_blocks = _collapse_sequences(matches) |
249 |
self.matching_blocks.append( (len(self.a), len(self.b), 0) ) |
|
250 |
if PatienceSequenceMatcher._do_check_consistency: |
|
251 |
if __debug__: |
|
252 |
_check_consistency(self.matching_blocks) |
|
253 |
||
254 |
return self.matching_blocks |
|
1185.81.16
by John Arbash Meinel
Added tests, and an assert check to make sure ranges are always increasing. |
255 |
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
256 |
|
1185.81.8
by John Arbash Meinel
Updating unified_diff to take a factory, using the new diff algorithm in the code. |
257 |
# This is a version of unified_diff which only adds a factory parameter
|
258 |
# so that you can override the default SequenceMatcher
|
|
259 |
# this has been submitted as a patch to python
|
|
260 |
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', |
|
261 |
tofiledate='', n=3, lineterm='\n', |
|
262 |
sequencematcher=None): |
|
263 |
r""" |
|
264 |
Compare two sequences of lines; generate the delta as a unified diff.
|
|
265 |
||
266 |
Unified diffs are a compact way of showing line changes and a few
|
|
267 |
lines of context. The number of context lines is set by 'n' which
|
|
268 |
defaults to three.
|
|
269 |
||
270 |
By default, the diff control lines (those with ---, +++, or @@) are
|
|
271 |
created with a trailing newline. This is helpful so that inputs
|
|
272 |
created from file.readlines() result in diffs that are suitable for
|
|
273 |
file.writelines() since both the inputs and outputs have trailing
|
|
274 |
newlines.
|
|
275 |
||
276 |
For inputs that do not have trailing newlines, set the lineterm
|
|
277 |
argument to "" so that the output will be uniformly newline free.
|
|
278 |
||
279 |
The unidiff format normally has a header for filenames and modification
|
|
280 |
times. Any or all of these may be specified using strings for
|
|
281 |
'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. The modification
|
|
282 |
times are normally expressed in the format returned by time.ctime().
|
|
283 |
||
284 |
Example:
|
|
285 |
||
286 |
>>> for line in unified_diff('one two three four'.split(),
|
|
287 |
... 'zero one tree four'.split(), 'Original', 'Current',
|
|
288 |
... 'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
|
|
289 |
... lineterm=''):
|
|
290 |
... print line
|
|
291 |
--- Original Sat Jan 26 23:30:50 1991
|
|
292 |
+++ Current Fri Jun 06 10:20:52 2003
|
|
293 |
@@ -1,4 +1,4 @@
|
|
294 |
+zero
|
|
295 |
one
|
|
296 |
-two
|
|
297 |
-three
|
|
298 |
+tree
|
|
299 |
four
|
|
300 |
"""
|
|
301 |
if sequencematcher is None: |
|
302 |
sequencematcher = difflib.SequenceMatcher |
|
303 |
||
304 |
started = False |
|
305 |
for group in sequencematcher(None,a,b).get_grouped_opcodes(n): |
|
306 |
if not started: |
|
307 |
yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm) |
|
308 |
yield '+++ %s %s%s' % (tofile, tofiledate, lineterm) |
|
309 |
started = True |
|
310 |
i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4] |
|
311 |
yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm) |
|
312 |
for tag, i1, i2, j1, j2 in group: |
|
313 |
if tag == 'equal': |
|
314 |
for line in a[i1:i2]: |
|
315 |
yield ' ' + line |
|
316 |
continue
|
|
317 |
if tag == 'replace' or tag == 'delete': |
|
318 |
for line in a[i1:i2]: |
|
319 |
yield '-' + line |
|
320 |
if tag == 'replace' or tag == 'insert': |
|
321 |
for line in b[j1:j2]: |
|
322 |
yield '+' + line |
|
323 |
||
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
324 |
|
1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
325 |
def unified_diff_files(a, b, sequencematcher=None): |
326 |
"""Generate the diff for two files.
|
|
327 |
"""
|
|
328 |
# Should this actually be an error?
|
|
329 |
if a == b: |
|
330 |
return [] |
|
331 |
if a == '-': |
|
332 |
file_a = sys.stdin |
|
333 |
time_a = time.time() |
|
334 |
else: |
|
335 |
file_a = open(a, 'rb') |
|
336 |
time_a = os.stat(a).st_mtime |
|
337 |
||
338 |
if b == '-': |
|
339 |
file_b = sys.stdin |
|
340 |
time_b = time.time() |
|
341 |
else: |
|
342 |
file_b = open(b, 'rb') |
|
343 |
time_b = os.stat(b).st_mtime |
|
344 |
||
345 |
# TODO: Include fromfiledate and tofiledate
|
|
346 |
return unified_diff(file_a.readlines(), file_b.readlines(), |
|
347 |
fromfile=a, tofile=b, |
|
348 |
sequencematcher=sequencematcher) |
|
349 |
||
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
350 |
|
1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
351 |
def main(args): |
352 |
import optparse |
|
353 |
p = optparse.OptionParser(usage='%prog [options] file_a file_b' |
|
354 |
'\nFiles can be "-" to read from stdin') |
|
1711.2.9
by John Arbash Meinel
Rename cdv => patience |
355 |
p.add_option('--patience', dest='matcher', action='store_const', const='patience', |
356 |
default='patience', help='Use the patience difference algorithm') |
|
1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
357 |
p.add_option('--difflib', dest='matcher', action='store_const', const='difflib', |
1711.2.9
by John Arbash Meinel
Rename cdv => patience |
358 |
default='patience', help='Use python\'s difflib algorithm') |
1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
359 |
|
1711.2.11
by John Arbash Meinel
Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher |
360 |
algorithms = {'patience':PatienceSequenceMatcher, 'difflib':difflib.SequenceMatcher} |
1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
361 |
|
362 |
(opts, args) = p.parse_args(args) |
|
363 |
matcher = algorithms[opts.matcher] |
|
364 |
||
365 |
if len(args) != 2: |
|
366 |
print 'You must supply 2 filenames to diff' |
|
367 |
return -1 |
|
368 |
||
369 |
for line in unified_diff_files(args[0], args[1], sequencematcher=matcher): |
|
370 |
sys.stdout.write(line) |
|
371 |
||
372 |
if __name__ == '__main__': |
|
373 |
sys.exit(main(sys.argv[1:])) |