~bzr-pqm/bzr/bzr.dev : contents of bzrlib/patiencediff.py at revision 1757.2.11

~bzr-pqm/bzr/bzr.dev : (revision 1757.2.11)

1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	1	#!/usr/bin/env python
1185.81.24 by Aaron Bentley Reoganize patience-related code	2	# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
1185.81.24 by Aaron Bentley Reoganize patience-related code	3	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	4	# This program is free software; you can redistribute it and/or modify
	5	# it under the terms of the GNU General Public License as published by
	6	# the Free Software Foundation; either version 2 of the License, or
	7	# (at your option) any later version.
1185.81.24 by Aaron Bentley Reoganize patience-related code	8	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU General Public License for more details.
1185.81.24 by Aaron Bentley Reoganize patience-related code	13	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	14	# You should have received a copy of the GNU General Public License
	15	# along with this program; if not, write to the Free Software
	16	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	17
	18
1185.81.24 by Aaron Bentley Reoganize patience-related code	19	from bisect import bisect
1185.81.24 by Aaron Bentley Reoganize patience-related code	20	from copy import copy
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	21	import difflib
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	22	import os
	23	import sys
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	24	import time
	25
1711.2.12 by John Arbash Meinel Make a mention when the maximum recursion length is reached.	26	from bzrlib.trace import mutter
	27
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	28
1711.2.11 by John Arbash Meinel Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher	29	__all__ = ['PatienceSequenceMatcher', 'unified_diff', 'unified_diff_files']
1185.81.9 by John Arbash Meinel Added (failing) tests for cdv.recurse_matches with common sections,	30
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	31
1185.81.24 by Aaron Bentley Reoganize patience-related code	32	def unique_lcs(a, b):
	33	"""Find the longest common subset for unique lines.
	34
	35	:param a: An indexable object (such as string or list of strings)
	36	:param b: Another indexable object (such as string or list of strings)
	37	:return: A list of tuples, one for each line which is matched.
	38	[(line_in_a, line_in_b), ...]
	39
	40	This only matches lines which are unique on both sides.
	41	This helps prevent common lines from over influencing match
	42	results.
	43	The longest common subset uses the Patience Sorting algorithm:
	44	http://en.wikipedia.org/wiki/Patience_sorting
	45	"""
	46	# set index[line in a] = position of line in a unless
	47	# unless a is a duplicate, in which case it's set to None
	48	index = {}
	49	for i in xrange(len(a)):
	50	line = a[i]
	51	if line in index:
	52	index[line] = None
	53	else:
	54	index[line]= i
	55	# make btoa[i] = position of line i in a, unless
	56	# that line doesn't occur exactly once in both,
	57	# in which case it's set to None
	58	btoa = [None] * len(b)
	59	index2 = {}
	60	for pos, line in enumerate(b):
	61	next = index.get(line)
	62	if next is not None:
	63	if line in index2:
	64	# unset the previous mapping, which we now know to
	65	# be invalid because the line isn't unique
	66	btoa[index2[line]] = None
	67	del index[line]
	68	else:
	69	index2[line] = pos
	70	btoa[pos] = next
	71	# this is the Patience sorting algorithm
	72	# see http://en.wikipedia.org/wiki/Patience_sorting
	73	backpointers = [None] * len(b)
	74	stacks = []
	75	lasts = []
	76	k = 0
	77	for bpos, apos in enumerate(btoa):
	78	if apos is None:
	79	continue
	80	# as an optimization, check if the next line comes at the end,
	81	# because it usually does
	82	if stacks and stacks[-1] < apos:
	83	k = len(stacks)
	84	# as an optimization, check if the next line comes right after
	85	# the previous line, because usually it does
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	86	elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
	87	stacks[k+1] > apos):
1185.81.24 by Aaron Bentley Reoganize patience-related code	88	k += 1
	89	else:
	90	k = bisect(stacks, apos)
	91	if k > 0:
	92	backpointers[bpos] = lasts[k-1]
	93	if k < len(stacks):
	94	stacks[k] = apos
	95	lasts[k] = bpos
	96	else:
	97	stacks.append(apos)
	98	lasts.append(bpos)
	99	if len(lasts) == 0:
	100	return []
	101	result = []
	102	k = lasts[-1]
	103	while k is not None:
	104	result.append((btoa[k], k))
	105	k = backpointers[k]
	106	result.reverse()
	107	return result
	108
	109
1711.2.22 by John Arbash Meinel Passing the alo parameter to recurse_matches shaves of 5% of the diff time.	110	def recurse_matches(a, b, alo, blo, ahi, bhi, answer, maxrecursion):
1185.81.24 by Aaron Bentley Reoganize patience-related code	111	"""Find all of the matching text in the lines of a and b.
	112
	113	:param a: A sequence
	114	:param b: Another sequence
1711.2.22 by John Arbash Meinel Passing the alo parameter to recurse_matches shaves of 5% of the diff time.	115	:param alo: The start location of a to check, typically 0
	116	:param ahi: The start location of b to check, typically 0
1185.81.24 by Aaron Bentley Reoganize patience-related code	117	:param ahi: The maximum length of a to check, typically len(a)
	118	:param bhi: The maximum length of b to check, typically len(b)
	119	:param answer: The return array. Will be filled with tuples
1711.2.17 by John Arbash Meinel Small cleanups to patience_diff code.	120	indicating [(line_in_a, line_in_b)]
1185.81.24 by Aaron Bentley Reoganize patience-related code	121	:param maxrecursion: The maximum depth to recurse.
	122	Must be a positive integer.
	123	:return: None, the return value is in the parameter answer, which
	124	should be a list
	125
	126	"""
	127	if maxrecursion < 0:
1711.2.12 by John Arbash Meinel Make a mention when the maximum recursion length is reached.	128	mutter('max recursion depth reached')
1185.81.24 by Aaron Bentley Reoganize patience-related code	129	# this will never happen normally, this check is to prevent DOS attacks
	130	return
	131	oldlength = len(answer)
	132	if alo == ahi or blo == bhi:
	133	return
1711.2.22 by John Arbash Meinel Passing the alo parameter to recurse_matches shaves of 5% of the diff time.	134	last_a_pos = alo-1
	135	last_b_pos = blo-1
1185.81.24 by Aaron Bentley Reoganize patience-related code	136	for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]):
	137	# recurse between lines which are unique in each file and match
	138	apos += alo
	139	bpos += blo
1711.2.18 by John Arbash Meinel Optimize common case where unique_lcs returns a set of lines all in a row	140	# Most of the time, you will have a sequence of similar entries
	141	if last_a_pos+1 != apos or last_b_pos+1 != bpos:
1711.2.22 by John Arbash Meinel Passing the alo parameter to recurse_matches shaves of 5% of the diff time.	142	recurse_matches(a, b, last_a_pos+1, last_b_pos+1,
	143	apos, bpos, answer, maxrecursion - 1)
1711.2.18 by John Arbash Meinel Optimize common case where unique_lcs returns a set of lines all in a row	144	last_a_pos = apos
	145	last_b_pos = bpos
1185.81.24 by Aaron Bentley Reoganize patience-related code	146	answer.append((apos, bpos))
	147	if len(answer) > oldlength:
	148	# find matches between the last match and the end
1711.2.22 by John Arbash Meinel Passing the alo parameter to recurse_matches shaves of 5% of the diff time.	149	recurse_matches(a, b, last_a_pos+1, last_b_pos+1,
	150	ahi, bhi, answer, maxrecursion - 1)
1185.81.24 by Aaron Bentley Reoganize patience-related code	151	elif a[alo] == b[blo]:
	152	# find matching lines at the very beginning
	153	while alo < ahi and blo < bhi and a[alo] == b[blo]:
	154	answer.append((alo, blo))
	155	alo += 1
	156	blo += 1
1711.2.22 by John Arbash Meinel Passing the alo parameter to recurse_matches shaves of 5% of the diff time.	157	recurse_matches(a, b, alo, blo,
	158	ahi, bhi, answer, maxrecursion - 1)
1185.81.24 by Aaron Bentley Reoganize patience-related code	159	elif a[ahi - 1] == b[bhi - 1]:
	160	# find matching lines at the very end
	161	nahi = ahi - 1
	162	nbhi = bhi - 1
	163	while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
	164	nahi -= 1
	165	nbhi -= 1
1711.2.22 by John Arbash Meinel Passing the alo parameter to recurse_matches shaves of 5% of the diff time.	166	recurse_matches(a, b, last_a_pos+1, last_b_pos+1,
	167	nahi, nbhi, answer, maxrecursion - 1)
1185.81.24 by Aaron Bentley Reoganize patience-related code	168	for i in xrange(ahi - nahi):
	169	answer.append((nahi + i, nbhi + i))
	170
	171
1711.2.21 by John Arbash Meinel Cleanup patiencediff, remove the use of difflib.SequenceMatcher.	172	def _collapse_sequences(matches):
	173	"""Find sequences of lines.
	174
	175	Given a sequence of [(line_in_a, line_in_b),]
	176	find regions where they both increment at the same time
	177	"""
	178	answer = []
	179	start_a = start_b = None
	180	length = 0
	181	for i_a, i_b in matches:
	182	if (start_a is not None
	183	and (i_a == start_a + length)
	184	and (i_b == start_b + length)):
	185	length += 1
	186	else:
	187	if start_a is not None:
	188	answer.append((start_a, start_b, length))
	189	start_a = i_a
	190	start_b = i_b
	191	length = 1
	192
	193	if length != 0:
	194	answer.append((start_a, start_b, length))
	195
	196	return answer
	197
	198
	199	def _check_consistency(answer):
	200	# For consistency sake, make sure all matches are only increasing
	201	next_a = -1
	202	next_b = -1
	203	for a,b,match_len in answer:
	204	assert a >= next_a, 'Non increasing matches for a'
	205	assert b >= next_b, 'Not increasing matches for b'
	206	next_a = a + match_len
	207	next_b = b + match_len
	208
	209
1711.2.11 by John Arbash Meinel Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher	210	class PatienceSequenceMatcher(difflib.SequenceMatcher):
1185.81.5 by John Arbash Meinel Fix up SequenceMatcher, add comments to nofrillsprecisemerge	211	"""Compare a pair of sequences using longest common subset."""
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	212
1711.2.21 by John Arbash Meinel Cleanup patiencediff, remove the use of difflib.SequenceMatcher.	213	_do_check_consistency = True
	214
1185.81.5 by John Arbash Meinel Fix up SequenceMatcher, add comments to nofrillsprecisemerge	215	def __init__(self, isjunk=None, a='', b=''):
	216	if isjunk is not None:
	217	raise NotImplementedError('Currently we do not support'
	218	' isjunk for sequence matching')
	219	difflib.SequenceMatcher.__init__(self, isjunk, a, b)
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	220
1711.2.7 by John Arbash Meinel Override get_matching_blocks	221	def get_matching_blocks(self):
	222	"""Return list of triples describing matching subsequences.
	223
	224	Each triple is of the form (i, j, n), and means that
	225	a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
	226	i and in j.
	227
	228	The last triple is a dummy, (len(a), len(b), 0), and is the only
	229	triple with n==0.
	230
1711.2.11 by John Arbash Meinel Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher	231	>>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
1711.2.7 by John Arbash Meinel Override get_matching_blocks	232	>>> s.get_matching_blocks()
	233	[(0, 0, 2), (3, 2, 2), (5, 4, 0)]
	234	"""
	235	# jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
	236	# implementation which uses __helper. 2.4.3 got rid of helper for
	237	# doing it inline with a queue.
	238	# We should consider doing the same for recurse_matches
	239
	240	if self.matching_blocks is not None:
	241	return self.matching_blocks
	242
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	243	matches = []
1711.2.22 by John Arbash Meinel Passing the alo parameter to recurse_matches shaves of 5% of the diff time.	244	recurse_matches(self.a, self.b, 0, 0,
	245	len(self.a), len(self.b), matches, 10)
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	246	# Matches now has individual line pairs of
	247	# line A matches line B, at the given offsets
1711.2.21 by John Arbash Meinel Cleanup patiencediff, remove the use of difflib.SequenceMatcher.	248	self.matching_blocks = _collapse_sequences(matches)
	249	self.matching_blocks.append( (len(self.a), len(self.b), 0) )
	250	if PatienceSequenceMatcher._do_check_consistency:
	251	if __debug__:
	252	_check_consistency(self.matching_blocks)
	253
	254	return self.matching_blocks
1185.81.16 by John Arbash Meinel Added tests, and an assert check to make sure ranges are always increasing.	255
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	256
1185.81.8 by John Arbash Meinel Updating unified_diff to take a factory, using the new diff algorithm in the code.	257	# This is a version of unified_diff which only adds a factory parameter
	258	# so that you can override the default SequenceMatcher
	259	# this has been submitted as a patch to python
	260	def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
	261	tofiledate='', n=3, lineterm='\n',
	262	sequencematcher=None):
	263	r"""
	264	Compare two sequences of lines; generate the delta as a unified diff.
	265
	266	Unified diffs are a compact way of showing line changes and a few
	267	lines of context. The number of context lines is set by 'n' which
	268	defaults to three.
	269
	270	By default, the diff control lines (those with ---, +++, or @@) are
	271	created with a trailing newline. This is helpful so that inputs
	272	created from file.readlines() result in diffs that are suitable for
	273	file.writelines() since both the inputs and outputs have trailing
	274	newlines.
	275
	276	For inputs that do not have trailing newlines, set the lineterm
	277	argument to "" so that the output will be uniformly newline free.
	278
	279	The unidiff format normally has a header for filenames and modification
	280	times. Any or all of these may be specified using strings for
	281	'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. The modification
	282	times are normally expressed in the format returned by time.ctime().
	283
	284	Example:
	285
	286	>>> for line in unified_diff('one two three four'.split(),
	287	... 'zero one tree four'.split(), 'Original', 'Current',
	288	... 'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
	289	... lineterm=''):
	290	... print line
	291	--- Original Sat Jan 26 23:30:50 1991
	292	+++ Current Fri Jun 06 10:20:52 2003
	293	@@ -1,4 +1,4 @@
	294	+zero
	295	one
	296	-two
	297	-three
	298	+tree
	299	four
	300	"""
	301	if sequencematcher is None:
	302	sequencematcher = difflib.SequenceMatcher
	303
	304	started = False
	305	for group in sequencematcher(None,a,b).get_grouped_opcodes(n):
	306	if not started:
	307	yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
	308	yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
	309	started = True
	310	i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
	311	yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm)
	312	for tag, i1, i2, j1, j2 in group:
	313	if tag == 'equal':
	314	for line in a[i1:i2]:
	315	yield ' ' + line
	316	continue
	317	if tag == 'replace' or tag == 'delete':
	318	for line in a[i1:i2]:
	319	yield '-' + line
	320	if tag == 'replace' or tag == 'insert':
321	for line in b[j1:j2]:
322	yield '+' + line
323
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	324
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	325	def unified_diff_files(a, b, sequencematcher=None):
	326	"""Generate the diff for two files.
	327	"""
	328	# Should this actually be an error?
	329	if a == b:
	330	return []
	331	if a == '-':
	332	file_a = sys.stdin
	333	time_a = time.time()
	334	else:
	335	file_a = open(a, 'rb')
	336	time_a = os.stat(a).st_mtime
	337
	338	if b == '-':
	339	file_b = sys.stdin
	340	time_b = time.time()
	341	else:
	342	file_b = open(b, 'rb')
	343	time_b = os.stat(b).st_mtime
	344
	345	# TODO: Include fromfiledate and tofiledate
	346	return unified_diff(file_a.readlines(), file_b.readlines(),
	347	fromfile=a, tofile=b,
	348	sequencematcher=sequencematcher)
	349
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	350
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	351	def main(args):
	352	import optparse
	353	p = optparse.OptionParser(usage='%prog [options] file_a file_b'
	354	'\nFiles can be "-" to read from stdin')
1711.2.9 by John Arbash Meinel Rename cdv => patience	355	p.add_option('--patience', dest='matcher', action='store_const', const='patience',
1711.2.9 by John Arbash Meinel Rename cdv => patience	356	default='patience', help='Use the patience difference algorithm')
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	357	p.add_option('--difflib', dest='matcher', action='store_const', const='difflib',
1711.2.9 by John Arbash Meinel Rename cdv => patience	358	default='patience', help='Use python\'s difflib algorithm')
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	359
1711.2.11 by John Arbash Meinel Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher	360	algorithms = {'patience':PatienceSequenceMatcher, 'difflib':difflib.SequenceMatcher}
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	361
	362	(opts, args) = p.parse_args(args)
	363	matcher = algorithms[opts.matcher]
	364
	365	if len(args) != 2:
	366	print 'You must supply 2 filenames to diff'
	367	return -1
	368
	369	for line in unified_diff_files(args[0], args[1], sequencematcher=matcher):
	370	sys.stdout.write(line)
	371
	372	if __name__ == '__main__':
	373	sys.exit(main(sys.argv[1:]))