# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

from bisect import bisect
import difflib
import optparse
import os
import sys
import time

from bzrlib.trace import mutter

__all__ = ['PatienceSequenceMatcher', 'unified_diff', 'unified_diff_files']


def unique_lcs(a, b):
    """Find the longest common subset for unique lines.

    :param a: An indexable object (such as string or list of strings)
    :param b: Another indexable object (such as string or list of strings)
    :return: A list of tuples, one for each line which is matched.
            [(line_in_a, line_in_b), ...]

    This only matches lines which are unique on both sides.
    This helps prevent common lines from over influencing the match
    results.
    The longest common subset uses the Patience Sorting algorithm:
    http://en.wikipedia.org/wiki/Patience_sorting
    """
    # set index[line in a] = position of line in a,
    # unless the line is a duplicate, in which case it's set to None
    index = {}
    for i in xrange(len(a)):
        line = a[i]
        if line in index:
            index[line] = None
        else:
            index[line] = i
    # make btoa[i] = position of line i in a, unless
    # that line doesn't occur exactly once in both,
    # in which case it's set to None
    btoa = [None] * len(b)
    index2 = {}
    for pos, line in enumerate(b):
        next = index.get(line)
        if next is not None:
            if line in index2:
                # unset the previous mapping, which we now know to
                # be invalid because the line isn't unique
                btoa[index2[line]] = None
                del index[line]
            else:
                index2[line] = pos
                btoa[pos] = next
    # this is the Patience sorting algorithm
    # see http://en.wikipedia.org/wiki/Patience_sorting
    backpointers = [None] * len(b)
    stacks = []
    lasts = []
    k = 0
    for bpos, apos in enumerate(btoa):
        if apos is None:
            continue
        # as an optimization, check if the next line comes at the end,
        # because it usually does
        if stacks and stacks[-1] < apos:
            k = len(stacks)
        # as an optimization, check if the next line comes right after
        # the previous line, because usually it does
        elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
                                              stacks[k+1] > apos):
            k += 1
        else:
            k = bisect(stacks, apos)
        if k > 0:
            backpointers[bpos] = lasts[k-1]
        if k < len(stacks):
            stacks[k] = apos
            lasts[k] = bpos
        else:
            stacks.append(apos)
            lasts.append(bpos)
    if len(lasts) == 0:
        return []
    result = []
    k = lasts[-1]
    while k is not None:
        result.append((btoa[k], k))
        k = backpointers[k]
    result.reverse()
    return result
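
# A worked example (illustrative, not part of the original module): with
# a = 'abcde' and b = 'cdeab', every element is unique on both sides, and
# the longest run the patience sort keeps in increasing order is 'cde':
#
#   unique_lcs('abcde', 'cdeab') == [(2, 0), (3, 1), (4, 2)]
#
# i.e. a[2:5] lines up with b[0:3]; the unique 'a' and 'b' lines are
# dropped because matching them would break the increasing order.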


def recurse_matches(a, b, alo, blo, ahi, bhi, answer, maxrecursion):
    """Find all of the matching text in the lines of a and b.

    :param a: A sequence
    :param b: Another sequence
    :param alo: The start location of a to check, typically 0
    :param blo: The start location of b to check, typically 0
    :param ahi: The maximum length of a to check, typically len(a)
    :param bhi: The maximum length of b to check, typically len(b)
    :param answer: The return array. Will be filled with tuples
                   indicating [(line_in_a, line_in_b)]
    :param maxrecursion: The maximum depth to recurse.
                         Must be a positive integer.
    :return: None, the return value is in the parameter answer, which
             should be a list
    """
    if maxrecursion < 0:
        mutter('max recursion depth reached')
        # this will never happen normally, this check is to prevent DOS attacks
        return
    oldlength = len(answer)
    if alo == ahi or blo == bhi:
        return
    last_a_pos = alo - 1
    last_b_pos = blo - 1
    for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]):
        # recurse between lines which are unique in each file and match
        apos += alo
        bpos += blo
        # Most of the time, you will have a sequence of similar entries
        if last_a_pos+1 != apos or last_b_pos+1 != bpos:
            recurse_matches(a, b, last_a_pos+1, last_b_pos+1,
                apos, bpos, answer, maxrecursion - 1)
        last_a_pos = apos
        last_b_pos = bpos
        answer.append((apos, bpos))
    if len(answer) > oldlength:
        # find matches between the last match and the end
        recurse_matches(a, b, last_a_pos+1, last_b_pos+1,
                        ahi, bhi, answer, maxrecursion - 1)
    elif a[alo] == b[blo]:
        # find matching lines at the very beginning
        while alo < ahi and blo < bhi and a[alo] == b[blo]:
            answer.append((alo, blo))
            alo += 1
            blo += 1
        recurse_matches(a, b, alo, blo,
                        ahi, bhi, answer, maxrecursion - 1)
    elif a[ahi - 1] == b[bhi - 1]:
        # find matching lines at the very end
        nahi = ahi - 1
        nbhi = bhi - 1
        while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
            nahi -= 1
            nbhi -= 1
        recurse_matches(a, b, last_a_pos+1, last_b_pos+1,
                        nahi, nbhi, answer, maxrecursion - 1)
        for i in xrange(ahi - nahi):
            answer.append((nahi + i, nbhi + i))
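
# Illustrative usage (this example is not in the original module): the
# caller passes an empty list and recurse_matches fills it in place, e.g.
#
#   answer = []
#   recurse_matches('abcdbb', 'abcbcbb', 0, 0, 6, 7, answer, 10)
#   answer == [(0, 0), (1, 1), (2, 2), (4, 5), (5, 6)]
#
# 'a' is the only line unique to both sides; the leading 'bc' run is then
# picked up by the match-at-the-beginning pass and the trailing 'bb' by
# the match-at-the-end pass.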


def _collapse_sequences(matches):
    """Find sequences of lines.

    Given a sequence of [(line_in_a, line_in_b),]
    find regions where they both increment at the same time
    """
    answer = []
    start_a = start_b = None
    length = 0
    for i_a, i_b in matches:
        if (start_a is not None
            and (i_a == start_a + length)
            and (i_b == start_b + length)):
            length += 1
        else:
            if start_a is not None:
                answer.append((start_a, start_b, length))
            start_a = i_a
            start_b = i_b
            length = 1

    if length != 0:
        answer.append((start_a, start_b, length))

    return answer
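
# For instance (illustrative): pairs that increment together collapse
# into a single (start_a, start_b, length) block,
#
#   _collapse_sequences([(0, 0), (1, 1), (2, 2), (4, 6)]) == [(0, 0, 3), (4, 6, 1)]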


def _check_consistency(answer):
    # For consistency's sake, make sure all matches are only increasing
    next_a = -1
    next_b = -1
    for a, b, match_len in answer:
        assert a >= next_a, 'Non increasing matches for a'
        assert b >= next_b, 'Non increasing matches for b'
        next_a = a + match_len
        next_b = b + match_len
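
# e.g. (illustrative) _check_consistency([(0, 0, 2), (1, 3, 1)]) raises an
# AssertionError, because the second match starts at a=1, inside the first
# match's range a[0:2].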


class PatienceSequenceMatcher(difflib.SequenceMatcher):
    """Compare a pair of sequences using longest common subset."""

    _do_check_consistency = True

    def __init__(self, isjunk=None, a='', b=''):
        if isjunk is not None:
            raise NotImplementedError('Currently we do not support'
                                      ' isjunk for sequence matching')
        difflib.SequenceMatcher.__init__(self, isjunk, a, b)

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """
        # jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
        # implementation which uses __helper. 2.4.3 got rid of helper for
        # doing it inline with a queue.
        # We should consider doing the same for recurse_matches

        if self.matching_blocks is not None:
            return self.matching_blocks

        matches = []
        recurse_matches(self.a, self.b, 0, 0,
                        len(self.a), len(self.b), matches, 10)
        # Matches now has individual line pairs of
        # line A matches line B, at the given offsets
        self.matching_blocks = _collapse_sequences(matches)
        self.matching_blocks.append( (len(self.a), len(self.b), 0) )
        if PatienceSequenceMatcher._do_check_consistency:
            _check_consistency(self.matching_blocks)

        return self.matching_blocks
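
# A small sketch of the matcher in use (not part of the original module):
#
#   sm = PatienceSequenceMatcher(None, ['a\n', 'b\n'], ['a\n', 'c\n', 'b\n'])
#   sm.get_matching_blocks() == [(0, 0, 1), (1, 2, 1), (2, 3, 0)]
#
# Because it extends difflib.SequenceMatcher, the inherited get_opcodes()
# and get_grouped_opcodes() work unchanged on top of these blocks.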


# This is a version of unified_diff which only adds a factory parameter
# so that you can override the default SequenceMatcher.
# This has been submitted as a patch to python.
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n',
                 sequencematcher=None):
    """
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with ---, +++, or @@) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The unidiff format normally has a header for filenames and modification
    times.  Any or all of these may be specified using strings for
    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.  The modification
    times are normally expressed in the format returned by time.ctime().

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
    ...             lineterm=''):
    ...     print line
    --- Original Sat Jan 26 23:30:50 1991
    +++ Current Fri Jun 06 10:20:52 2003
    @@ -1,4 +1,4 @@
    +zero
     one
    -two
    -three
    +tree
     four
    """
    if sequencematcher is None:
        sequencematcher = difflib.SequenceMatcher

    started = False
    for group in sequencematcher(None,a,b).get_grouped_opcodes(n):
        if not started:
            yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
            started = True
        i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
        yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm)
        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield ' ' + line
                continue
            if tag == 'replace' or tag == 'delete':
                for line in a[i1:i2]:
                    yield '-' + line
            if tag == 'replace' or tag == 'insert':
                for line in b[j1:j2]:
                    yield '+' + line


def unified_diff_files(a, b, sequencematcher=None):
    """Generate the diff for two files.
    """
    # Should this actually be an error?
    if a == b:
        return []

    if a == '-':
        file_a = sys.stdin
        time_a = time.time()
    else:
        file_a = open(a, 'rb')
        time_a = os.stat(a).st_mtime

    if b == '-':
        file_b = sys.stdin
        time_b = time.time()
    else:
        file_b = open(b, 'rb')
        time_b = os.stat(b).st_mtime

    # TODO: Include fromfiledate and tofiledate
    return unified_diff(file_a.readlines(), file_b.readlines(),
                        fromfile=a, tofile=b,
                        sequencematcher=sequencematcher)


def main(args):
    p = optparse.OptionParser(usage='%prog [options] file_a file_b'
                                    '\nFiles can be "-" to read from stdin')
    p.add_option('--patience', dest='matcher', action='store_const', const='patience',
                 default='patience', help='Use the patience difference algorithm')
    p.add_option('--difflib', dest='matcher', action='store_const', const='difflib',
                 default='patience', help='Use python\'s difflib algorithm')

    algorithms = {'patience':PatienceSequenceMatcher, 'difflib':difflib.SequenceMatcher}

    (opts, args) = p.parse_args(args)
    matcher = algorithms[opts.matcher]

    if len(args) != 2:
        print 'You must supply 2 filenames to diff'
        return -1

    for line in unified_diff_files(args[0], args[1], sequencematcher=matcher):
        sys.stdout.write(line)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))