# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

from bisect import bisect
import difflib
import os
import sys
import time

from bzrlib.trace import mutter

__all__ = ['PatienceSequenceMatcher', 'unified_diff', 'unified_diff_files']


def unique_lcs(a, b):
    """Find the longest common subset for unique lines.

    :param a: An indexable object (such as a string or list of strings)
    :param b: Another indexable object (such as a string or list of strings)
    :return: A list of tuples, one for each line which is matched.
        [(line_in_a, line_in_b), ...]

    This only matches lines which are unique on both sides.
    This helps prevent common lines from over influencing match results.
    The longest common subset uses the Patience Sorting algorithm:
    http://en.wikipedia.org/wiki/Patience_sorting
    """
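    # For illustration (an informal example, not in the original file):
    # unique_lcs('abcbd', 'adbcb') pairs only the letters that occur
    # exactly once on each side; the repeated 'b' can never anchor a
    # match, and the result is [(0, 0), (2, 3)], matching 'a' with 'a'
    # and 'c' with 'c'.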
    # set index[line in a] = position of line in a,
    # unless line is a duplicate, in which case it's set to None
    index = {}
    for i in xrange(len(a)):
        line = a[i]
        if line in index:
            index[line] = None
        else:
            index[line] = i
    # make btoa[i] = position of line i in a, unless
    # that line doesn't occur exactly once in both,
    # in which case it's set to None
    btoa = [None] * len(b)
    index2 = {}
    for pos, line in enumerate(b):
        next = index.get(line)
        if next is not None:
            if line in index2:
                # unset the previous mapping, which we now know to
                # be invalid because the line isn't unique
                btoa[index2[line]] = None
                del index[line]
            else:
                index2[line] = pos
                btoa[pos] = next
    # this is the Patience sorting algorithm
    # see http://en.wikipedia.org/wiki/Patience_sorting
    backpointers = [None] * len(b)
    stacks = []
    lasts = []
    k = 0
    for bpos, apos in enumerate(btoa):
        if apos is None:
            continue
        # as an optimization, check if the next line comes at the end,
        # because it usually does
        if stacks and stacks[-1] < apos:
            k = len(stacks)
        # as an optimization, check if the next line comes right after
        # the previous line, because usually it does
        elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
                                              stacks[k+1] > apos):
            k += 1
        else:
            k = bisect(stacks, apos)
        if k > 0:
            backpointers[bpos] = lasts[k-1]
        if k < len(stacks):
            stacks[k] = apos
            lasts[k] = bpos
        else:
            stacks.append(apos)
            lasts.append(bpos)
    if len(lasts) == 0:
        return []
    result = []
    k = lasts[-1]
    while k is not None:
        result.append((btoa[k], k))
        k = backpointers[k]
    result.reverse()
    return result
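

# Illustration of the patience step above (an informal sketch, not part
# of the original module): for matched a-positions [0, 4, 2] taken in b
# order, the stacks evolve [0] -> [0, 4] -> [0, 2], since 2 displaces 4
# on the second pile. Walking the recorded backpointers from the last
# pile then recovers the longest increasing chain of
# (line_in_a, line_in_b) pairs.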


def recurse_matches(a, b, alo, blo, ahi, bhi, answer, maxrecursion):
    """Find all of the matching text in the lines of a and b.

    :param a: A sequence
    :param b: Another sequence
    :param alo: The start location of a to check, typically 0
    :param blo: The start location of b to check, typically 0
    :param ahi: The maximum length of a to check, typically len(a)
    :param bhi: The maximum length of b to check, typically len(b)
    :param answer: The return array. Will be filled with tuples
                   indicating [(line_in_a, line_in_b)]
    :param maxrecursion: The maximum depth to recurse.
                         Must be a positive integer.
    :return: None, the return value is in the parameter answer, which
             should be a list
    """
    if maxrecursion < 0:
        mutter('max recursion depth reached')
        # this will never happen normally, this check is to prevent DOS attacks
        return
    oldlength = len(answer)
    if alo == ahi or blo == bhi:
        return
    last_a_pos = alo-1
    last_b_pos = blo-1
    for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]):
        # recurse between lines which are unique in each file and match
        apos += alo
        bpos += blo
        # Most of the time, you will have a sequence of similar entries
        if last_a_pos+1 != apos or last_b_pos+1 != bpos:
            recurse_matches(a, b, last_a_pos+1, last_b_pos+1,
                            apos, bpos, answer, maxrecursion - 1)
        last_a_pos = apos
        last_b_pos = bpos
        answer.append((apos, bpos))
    if len(answer) > oldlength:
        # find matches between the last match and the end
        recurse_matches(a, b, last_a_pos+1, last_b_pos+1,
                        ahi, bhi, answer, maxrecursion - 1)
    elif a[alo] == b[blo]:
        # find matching lines at the very beginning
        while alo < ahi and blo < bhi and a[alo] == b[blo]:
            answer.append((alo, blo))
            alo += 1
            blo += 1
        recurse_matches(a, b, alo, blo,
                        ahi, bhi, answer, maxrecursion - 1)
    elif a[ahi - 1] == b[bhi - 1]:
        # find matching lines at the very end
        nahi = ahi - 1
        nbhi = bhi - 1
        while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
            nahi -= 1
            nbhi -= 1
        recurse_matches(a, b, last_a_pos+1, last_b_pos+1,
                        nahi, nbhi, answer, maxrecursion - 1)
        for i in xrange(ahi - nahi):
            answer.append((nahi + i, nbhi + i))
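

# Worked example (an informal sketch, not part of the original module):
#     answer = []
#     recurse_matches(['a', 'b', 'c'], ['a', 'x', 'c'], 0, 0, 3, 3, answer, 10)
# leaves answer == [(0, 0), (2, 2)]: 'b' and 'x' differ, so only the
# first and last lines match.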


def _collapse_sequences(matches):
    """Find sequences of lines.

    Given a sequence of [(line_in_a, line_in_b),]
    find regions where they both increment at the same time
    """
    answer = []
    start_a = start_b = None
    length = 0
    for i_a, i_b in matches:
        if (start_a is not None
            and (i_a == start_a + length)
            and (i_b == start_b + length)):
            length += 1
        else:
            if start_a is not None:
                answer.append((start_a, start_b, length))
            start_a = i_a
            start_b = i_b
            length = 1

    if length != 0:
        answer.append((start_a, start_b, length))

    return answer
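

# For example (an informal sketch, not part of the original module):
#     _collapse_sequences([(0, 0), (1, 1), (2, 2), (4, 6)])
# returns [(0, 0, 3), (4, 6, 1)]: the first three pairs advance in lock
# step and collapse into one (start_a, start_b, length) region.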


def _check_consistency(answer):
    # For consistency's sake, make sure all matches are only increasing
    next_a = -1
    next_b = -1
    for a, b, match_len in answer:
        assert a >= next_a, 'Non increasing matches for a'
        assert b >= next_b, 'Non increasing matches for b'
        next_a = a + match_len
        next_b = b + match_len


class PatienceSequenceMatcher(difflib.SequenceMatcher):
    """Compare a pair of sequences using longest common subset."""

    _do_check_consistency = True

    def __init__(self, isjunk=None, a='', b=''):
        if isjunk is not None:
            raise NotImplementedError('Currently we do not support'
                                      ' isjunk for sequence matching')
        difflib.SequenceMatcher.__init__(self, isjunk, a, b)

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n == 0.

        >>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """
        # jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
        # implementation which uses __helper. 2.4.3 got rid of helper for
        # doing it inline with a queue.
        # We should consider doing the same for recurse_matches

        if self.matching_blocks is not None:
            return self.matching_blocks

        matches = []
        recurse_matches(self.a, self.b, 0, 0,
                        len(self.a), len(self.b), matches, 10)
        # Matches now has individual line pairs of
        # line A matches line B, at the given offsets
        self.matching_blocks = _collapse_sequences(matches)
        self.matching_blocks.append( (len(self.a), len(self.b), 0) )
        if PatienceSequenceMatcher._do_check_consistency:
            _check_consistency(self.matching_blocks)

        return self.matching_blocks
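
    # Because only get_matching_blocks() is overridden, the methods
    # inherited from difflib.SequenceMatcher, such as get_opcodes() and
    # get_grouped_opcodes(), automatically operate on the patience
    # matching blocks (an informal note, not part of the original module).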


# This is a version of unified_diff which only adds a factory parameter
# so that you can override the default SequenceMatcher
# this has been submitted as a patch to python
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n',
                 sequencematcher=None):
    r"""
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with ---, +++, or @@) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The unidiff format normally has a header for filenames and modification
    times.  Any or all of these may be specified using strings for
    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.  The modification
    times are normally expressed in the format returned by time.ctime().

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
    ...             lineterm=''):
    ...     print line
    --- Original Sat Jan 26 23:30:50 1991
    +++ Current Fri Jun 06 10:20:52 2003
    @@ -1,4 +1,4 @@
    +zero
     one
    -two
    -three
    +tree
     four
    """
    if sequencematcher is None:
        sequencematcher = difflib.SequenceMatcher

    started = False
    for group in sequencematcher(None, a, b).get_grouped_opcodes(n):
        if not started:
            yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
            started = True
        i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
        yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm)
        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield ' ' + line
                continue
            if tag == 'replace' or tag == 'delete':
                for line in a[i1:i2]:
                    yield '-' + line
            if tag == 'replace' or tag == 'insert':
                for line in b[j1:j2]:
                    yield '+' + line
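

# Usage sketch (illustrative, not part of the original module):
#     for line in unified_diff(open('old.txt').readlines(),
#                              open('new.txt').readlines(),
#                              fromfile='old.txt', tofile='new.txt',
#                              sequencematcher=PatienceSequenceMatcher):
#         sys.stdout.write(line)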


def unified_diff_files(a, b, sequencematcher=None):
    """Generate the diff for two files.
    """
    # Should this actually be an error?
    if a == b:
        return []

    # '-' means read from stdin (see the command-line help in main below)
    if a == '-':
        file_a = sys.stdin
        time_a = time.time()
    else:
        file_a = open(a, 'rb')
        time_a = os.stat(a).st_mtime

    if b == '-':
        file_b = sys.stdin
        time_b = time.time()
    else:
        file_b = open(b, 'rb')
        time_b = os.stat(b).st_mtime

    # TODO: Include fromfiledate and tofiledate
    return unified_diff(file_a.readlines(), file_b.readlines(),
                        fromfile=a, tofile=b,
                        sequencematcher=sequencematcher)


def main(args):
    import optparse
    p = optparse.OptionParser(usage='%prog [options] file_a file_b'
                                    '\nFiles can be "-" to read from stdin')
    p.add_option('--patience', dest='matcher', action='store_const', const='patience',
                 default='patience', help='Use the patience difference algorithm')
    p.add_option('--difflib', dest='matcher', action='store_const', const='difflib',
                 default='patience', help='Use python\'s difflib algorithm')

    algorithms = {'patience': PatienceSequenceMatcher,
                  'difflib': difflib.SequenceMatcher}

    (opts, args) = p.parse_args(args)
    matcher = algorithms[opts.matcher]

    if len(args) != 2:
        print 'You must supply 2 filenames to diff'
        return -1

    for line in unified_diff_files(args[0], args[1], sequencematcher=matcher):
        sys.stdout.write(line)
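

# Command-line sketch (illustrative; the module filename is an assumption):
#     python patiencediff.py --patience file_a file_b
#     python patiencediff.py --difflib file_a -      # '-' reads stdin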


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))