~bzr-pqm/bzr/bzr.dev : revision 1725.2.6

1

#!/usr/bin/env python

2

3

#

4

# This program is free software; you can redistribute it and/or modify

5

# it under the terms of the GNU General Public License as published by

6

# the Free Software Foundation; either version 2 of the License, or

7

# (at your option) any later version.

8

#

9

# This program is distributed in the hope that it will be useful,

10

# but WITHOUT ANY WARRANTY; without even the implied warranty of

11

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

12

# GNU General Public License for more details.

13

#

14

# You should have received a copy of the GNU General Public License

15

# along with this program; if not, write to the Free Software

16

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

17

18

19

from bisect import bisect

20

from copy import copy

21

import difflib

22

import os

23

import sys

24

import time

25

26

27

__all__ = ['SequenceMatcher', 'unified_diff', 'unified_diff_files']

28

29

30

def unique_lcs(a, b):

31

"""Find the longest common subset for unique lines.

32

33

:param a: An indexable object (such as string or list of strings)

34

:param b: Another indexable object (such as string or list of strings)

35

:return: A list of tuples, one for each line which is matched.

36

[(line_in_a, line_in_b), ...]

37

38

This only matches lines which are unique on both sides.

39

This helps prevent common lines from over influencing match

40

results.

41

The longest common subset uses the Patience Sorting algorithm:

42

http://en.wikipedia.org/wiki/Patience_sorting

43

"""

44

# set index[line in a] = position of line in a unless

45

# unless a is a duplicate, in which case it's set to None

46

index = {}

47

for i in xrange(len(a)):

48

line = a[i]

49

if line in index:

50

index[line] = None

51

else:

52

index[line]= i

53

# make btoa[i] = position of line i in a, unless

54

# that line doesn't occur exactly once in both,

55

# in which case it's set to None

56

btoa = [None] * len(b)

57

index2 = {}

58

for pos, line in enumerate(b):

59

next = index.get(line)

60

if next is not None:

61

if line in index2:

62

# unset the previous mapping, which we now know to

63

# be invalid because the line isn't unique

64

btoa[index2[line]] = None

65

del index[line]

66

else:

67

index2[line] = pos

68

btoa[pos] = next

69

# this is the Patience sorting algorithm

70

# see http://en.wikipedia.org/wiki/Patience_sorting

71

backpointers = [None] * len(b)

72

stacks = []

73

lasts = []

74

k = 0

75

for bpos, apos in enumerate(btoa):

76

if apos is None:

77

continue

78

# as an optimization, check if the next line comes at the end,

79

# because it usually does

80

if stacks and stacks[-1] < apos:

81

k = len(stacks)

82

# as an optimization, check if the next line comes right after

83

# the previous line, because usually it does

84

elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or

85

stacks[k+1] > apos):

86

k += 1

87

else:

88

k = bisect(stacks, apos)

89

if k > 0:

90

backpointers[bpos] = lasts[k-1]

91

if k < len(stacks):

92

stacks[k] = apos

93

lasts[k] = bpos

94

else:

95

stacks.append(apos)

96

lasts.append(bpos)

97

if len(lasts) == 0:

98

return []

99

result = []

100

k = lasts[-1]

101

while k is not None:

102

result.append((btoa[k], k))

103

k = backpointers[k]

104

result.reverse()

105

return result

106

107

108

def recurse_matches(a, b, ahi, bhi, answer, maxrecursion):

109

"""Find all of the matching text in the lines of a and b.

110

111

:param a: A sequence

112

:param b: Another sequence

113

:param ahi: The maximum length of a to check, typically len(a)

114

:param bhi: The maximum length of b to check, typically len(b)

115

:param answer: The return array. Will be filled with tuples

116

indicating [(line_in_a), (line_in_b)]

117

:param maxrecursion: The maximum depth to recurse.

118

Must be a positive integer.

119

:return: None, the return value is in the parameter answer, which

120

should be a list

121

122

"""

123

oldlen = len(answer)

124

if maxrecursion < 0:

125

# this will never happen normally, this check is to prevent DOS attacks

126

return

127

oldlength = len(answer)

128

if len(answer) == 0:

129

alo, blo = 0, 0

130

else:

131

alo, blo = answer[-1]

132

alo += 1

133

blo += 1

134

if alo == ahi or blo == bhi:

135

return

136

for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]):

137

# recurse between lines which are unique in each file and match

138

apos += alo

139

bpos += blo

140

recurse_matches(a, b, apos, bpos, answer, maxrecursion - 1)

141

answer.append((apos, bpos))

142

if len(answer) > oldlength:

143

# find matches between the last match and the end

144

recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1)

145

elif a[alo] == b[blo]:

146

# find matching lines at the very beginning

147

while alo < ahi and blo < bhi and a[alo] == b[blo]:

148

answer.append((alo, blo))

149

alo += 1

150

blo += 1

151

recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1)

152

elif a[ahi - 1] == b[bhi - 1]:

153

# find matching lines at the very end

154

nahi = ahi - 1

155

nbhi = bhi - 1

156

while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:

157

nahi -= 1

158

nbhi -= 1

159

recurse_matches(a, b, nahi, nbhi, answer, maxrecursion - 1)

160

for i in xrange(ahi - nahi):

161

answer.append((nahi + i, nbhi + i))

162

163

164

class SequenceMatcher(difflib.SequenceMatcher):

165

"""Compare a pair of sequences using longest common subset."""

166

167

def __init__(self, isjunk=None, a='', b=''):

168

if isjunk is not None:

169

raise NotImplementedError('Currently we do not support'

170

' isjunk for sequence matching')

171

difflib.SequenceMatcher.__init__(self, isjunk, a, b)

172

173

def _check_with_diff(self, alo, ahi, blo, bhi, answer):

174

"""Use the original diff algorithm on an unmatched section.

175

176

This will check to make sure the range is worth checking,

177

before doing any work.

178

179

:param alo: The last line that actually matched

180

:param ahi: The next line that actually matches

181

:param blo: Same as alo, only for the 'b' set

182

:param bhi: Same as ahi

183

:param answer: An array which will have the new ranges appended to it

184

:return: None

185

"""

186

# WORKAROUND

187

# recurse_matches has an implementation design

188

# which does not match non-unique lines in the

189

# if they do not touch matching unique lines

190

# so we rerun the regular diff algorithm

191

# if find a large enough chunk.

192

193

# recurse_matches already looked at the direct

194

# neighbors, so we only need to run if there is

195

# enough space to do so

196

if ahi - alo > 2 and bhi - blo > 2:

197

a = self.a[alo+1:ahi-1]

198

b = self.b[blo+1:bhi-1]

199

m = difflib.SequenceMatcher(None, a, b)

200

new_blocks = m.get_matching_blocks()

201

# difflib always adds a final match

202

new_blocks.pop()

203

for blk in new_blocks:

204

answer.append((blk[0]+alo+1,

205

blk[1]+blo+1,

206

blk[2]))

207

208

def __helper(self, alo, ahi, blo, bhi, answer):

209

matches = []

210

a = self.a[alo:ahi]

211

b = self.b[blo:bhi]

212

recurse_matches(a, b, len(a), len(b), matches, 10)

213

# Matches now has individual line pairs of

214

# line A matches line B, at the given offsets

215

216

start_a = start_b = None

217

length = 0

218

for i_a, i_b in matches:

219

if (start_a is not None

220

and (i_a == start_a + length)

221

and (i_b == start_b + length)):

222

length += 1

223

else:

224

# New block

225

if start_a is None:

226

# We need to check from 0,0 until the current match

227

self._check_with_diff(alo-1, i_a+alo, blo-1, i_b+blo,

228

answer)

229

else:

230

answer.append((start_a+alo, start_b+blo, length))

231

self._check_with_diff(start_a+alo+length, i_a+alo,

232

start_b+blo+length, i_b+blo,

233

answer)

234

235

start_a = i_a

236

start_b = i_b

237

length = 1

238

239

if length != 0:

240

answer.append((start_a+alo, start_b+blo, length))

241

self._check_with_diff(start_a+alo+length, ahi+1,

242

start_b+blo+length, bhi+1,

243

answer)

244

if not matches:

245

# Nothing matched, so we need to send the complete text

246

self._check_with_diff(alo-1, ahi+1, blo-1, bhi+1, answer)

247

248

# For consistency sake, make sure all matches are only increasing

249

if __debug__:

250

next_a = -1

251

next_b = -1

252

for a,b,match_len in answer:

253

assert a >= next_a, 'Non increasing matches for a'

254

assert b >= next_b, 'Not increasing matches for b'

255

next_a = a + match_len

256

next_b = b + match_len

257

258

259

# This is a version of unified_diff which only adds a factory parameter

260

# so that you can override the default SequenceMatcher

261

# this has been submitted as a patch to python

262

def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',

263

tofiledate='', n=3, lineterm='\n',

264

sequencematcher=None):

265

r"""

266

Compare two sequences of lines; generate the delta as a unified diff.

267

268

Unified diffs are a compact way of showing line changes and a few

269

lines of context. The number of context lines is set by 'n' which

270

defaults to three.

271

272

By default, the diff control lines (those with ---, +++, or @@) are

273

created with a trailing newline. This is helpful so that inputs

274

created from file.readlines() result in diffs that are suitable for

275

file.writelines() since both the inputs and outputs have trailing

276

newlines.

277

278

For inputs that do not have trailing newlines, set the lineterm

279

argument to "" so that the output will be uniformly newline free.

280

281

The unidiff format normally has a header for filenames and modification

282

times. Any or all of these may be specified using strings for

283

'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. The modification

284

times are normally expressed in the format returned by time.ctime().

285

286

Example:

287

288

>>> for line in unified_diff('one two three four'.split(),

289

... 'zero one tree four'.split(), 'Original', 'Current',

290

... 'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',

291

... lineterm=''):

292

... print line

293

--- Original Sat Jan 26 23:30:50 1991

294

+++ Current Fri Jun 06 10:20:52 2003

295

@@ -1,4 +1,4 @@

296

+zero

297

one

298

-two

299

-three

300

+tree

301

four

302

"""

303

if sequencematcher is None:

304

sequencematcher = difflib.SequenceMatcher

305

306

started = False

307

for group in sequencematcher(None,a,b).get_grouped_opcodes(n):

308

if not started:

309

yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)

310

yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)

311

started = True

312

i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]

313

yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm)

314

for tag, i1, i2, j1, j2 in group:

315

if tag == 'equal':

316

for line in a[i1:i2]:

317

yield ' ' + line

318

continue

319

if tag == 'replace' or tag == 'delete':

320

for line in a[i1:i2]:

321

yield '-' + line

322

if tag == 'replace' or tag == 'insert':

323

for line in b[j1:j2]:

324

yield '+' + line

325

326

327

def unified_diff_files(a, b, sequencematcher=None):

328

"""Generate the diff for two files.

329

"""

330

# Should this actually be an error?

331

if a == b:

332

return []

333

if a == '-':

334

file_a = sys.stdin

335

time_a = time.time()

336

else:

337

file_a = open(a, 'rb')

338

time_a = os.stat(a).st_mtime

339

340

if b == '-':

341

file_b = sys.stdin

342

time_b = time.time()

343

else:

344

file_b = open(b, 'rb')

345

time_b = os.stat(b).st_mtime

346

347

# TODO: Include fromfiledate and tofiledate

348

return unified_diff(file_a.readlines(), file_b.readlines(),

349

fromfile=a, tofile=b,

350

sequencematcher=sequencematcher)

351

352

353

def main(args):

354

import optparse

355

p = optparse.OptionParser(usage='%prog [options] file_a file_b'

356

'\nFiles can be "-" to read from stdin')

357

p.add_option('--cdv', dest='matcher', action='store_const', const='cdv',

358

default='cdv', help='Use the cdv difference algorithm')

359

p.add_option('--difflib', dest='matcher', action='store_const', const='difflib',

360

default='cdv', help='Use python\'s difflib algorithm')

361

362

algorithms = {'cdv':SequenceMatcher, 'difflib':difflib.SequenceMatcher}

363

364

(opts, args) = p.parse_args(args)

365

matcher = algorithms[opts.matcher]

366

367

if len(args) != 2:

368

print 'You must supply 2 filenames to diff'

369

return -1

370

371

for line in unified_diff_files(args[0], args[1], sequencematcher=matcher):

372

sys.stdout.write(line)

373

374

if __name__ == '__main__':

375

sys.exit(main(sys.argv[1:]))