1
from sets import Set as set
3
from bisect import bisect
6
"""Find the longest common subset for unique lines.
8
:param a: An indexable object (such as string or list of strings)
9
:param b: Another indexable object (such as string or list of strings)
10
:return: A list of tuples, one for each line which is matched.
11
[(line_in_a, line_in_b), ...]
13
This only matches lines which are unique on both sides.
14
This helps prevent common lines from over influencing match
16
The longest common subset uses the Patience Sorting algorithm:
17
http://en.wikipedia.org/wiki/Patience_sorting
19
# set index[line in a] = position of line in a unless
20
# unless a is a duplicate, in which case it's set to None
22
for i in xrange(len(a)):
28
# make btoa[i] = position of line i in a, unless
29
# that line doesn't occur exactly once in both,
30
# in which case it's set to None
31
btoa = [None] * len(b)
33
for pos, line in enumerate(b):
34
next = index.get(line)
37
# unset the previous mapping, which we now know to
38
# be invalid because the line isn't unique
39
btoa[index2[line]] = None
44
# this is the Patience sorting algorithm
45
# see http://en.wikipedia.org/wiki/Patience_sorting
46
backpointers = [None] * len(b)
50
for bpos, apos in enumerate(btoa):
53
# as an optimization, check if the next line comes at the end,
54
# because it usually does
55
if stacks and stacks[-1] < apos:
57
# as an optimization, check if the next line comes right after
58
# the previous line, because usually it does
59
elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or stacks[k+1] > apos):
62
k = bisect(stacks, apos)
64
backpointers[bpos] = lasts[k-1]
76
result.append((btoa[k], k))
81
assert unique_lcs('', '') == []
82
assert unique_lcs('a', 'a') == [(0, 0)]
83
assert unique_lcs('a', 'b') == []
84
assert unique_lcs('ab', 'ab') == [(0, 0), (1, 1)]
85
assert unique_lcs('abcde', 'cdeab') == [(2, 0), (3, 1), (4, 2)]
86
assert unique_lcs('cdeab', 'abcde') == [(0, 2), (1, 3), (2, 4)]
87
assert unique_lcs('abXde', 'abYde') == [(0, 0), (1, 1), (3, 3), (4, 4)]
88
assert unique_lcs('acbac', 'abc') == [(2, 1)]
90
def recurse_matches(a, b, ahi, bhi, answer, maxrecursion):
91
"""Find all of the matching text in the lines of a and b.
94
:param b: Another sequence
95
:param ahi: The maximum length of a to check, typically len(a)
96
:param bhi: The maximum length of b to check, typically len(b)
97
:param answer: The return array. Will be filled with tuples
98
indicating [(line_in_a), (line_in_b)]
99
:param maxrecursion: The maximum depth to recurse.
100
Must be a positive integer.
101
:return: None, the return value is in the parameter answer, which
106
# this will never happen normally, this check is to prevent DOS attacks
108
oldlength = len(answer)
112
alo, blo = answer[-1]
115
if alo == ahi or blo == bhi:
117
for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]):
118
# recurse between lines which are unique in each file and match
121
recurse_matches(a, b, apos, bpos, answer, maxrecursion - 1)
122
answer.append((apos, bpos))
123
if len(answer) > oldlength:
124
# find matches between the last match and the end
125
recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1)
126
elif a[alo] == b[blo]:
127
# find matching lines at the very beginning
128
while alo < ahi and blo < bhi and a[alo] == b[blo]:
129
answer.append((alo, blo))
132
recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1)
133
elif a[ahi - 1] == b[bhi - 1]:
134
# find matching lines at the very end
137
while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
140
recurse_matches(a, b, nahi, nbhi, answer, maxrecursion - 1)
141
for i in xrange(ahi - nahi):
142
answer.append((nahi + i, nbhi + i))
145
recurse_matches(['a', None, 'b', None, 'c'], ['a', 'a', 'b', 'c', 'c'], 5, 5, a1, 10)
146
assert a1 == [(0, 0), (2, 2), (4, 4)]
148
recurse_matches(['a', 'c', 'b', 'a', 'c'], ['a', 'b', 'c'], 5, 3, a2, 10)
149
assert a2 == [(0, 0), (2, 1), (4, 2)]
157
# {revid: [(lineid, state)]}
158
# states are integers
159
# each line's state starts at 0, then goes to 1, 2, etc.
160
# odd states are when the line is present, even are when it is not
161
# the merge between two states is the greater of the two values
164
def add_revision(self, revid, lines, parents):
165
assert revid not in self.parents
167
assert p in self.parents
168
self.parents[revid] = copy(parents)
170
# match against the weave
171
lines2 = [line for (lineid, line) in self.weave]
172
recurse_matches(lines, lines2, len(lines), len(lines2), matches, 10)
175
s.add(self.weave[b][0])
176
vs = [self._make_vals(p) for p in parents]
177
# calculate which lines had their states changed in this revision
180
for lineid, line in self.weave:
181
state = max([v.get(lineid, 0) for v in vs])
182
if (state & 1 == 1) != (lineid in s):
183
newvals.append((lineid, state + 1))
185
for lineid, line in self.weave:
187
newvals.append((lineid, 1))
192
matches.append((len(lines), len(lines2)))
195
# add current weave lines to the new weave
196
newweave.extend(self.weave[weavepos + 1:b])
198
# add lines which have never appeared before to the weave
199
for i in xrange(revpos + 1, a):
201
newweave.append((lineid, lines[i]))
202
newvals.append((lineid, 1))
204
newweave.append(self.weave[b])
207
self.newstates[revid] = newvals
208
self.weave = newweave
210
def _parents(self, revid):
215
if next not in result:
216
unused.extend(self.parents[next])
220
def _make_vals(self, revid):
221
# return {lineid: state} for the given revision
222
s = self._parents(revid)
225
for p, q in self.newstates[n]:
226
v[p] = max(v.get(p, 0), q)
229
def retrieve_revision(self, revid):
230
# returns a list of strings
231
v = self._make_vals(revid)
232
return [line for (lineid, line) in self.weave if (v.get(lineid, 0) & 1)]
234
def annotate(self, revid):
235
# returns [(line, whether present, [perpetrator])]
236
ps = self._parents(revid)
237
# {lineid: [(parent, state)]}
240
for lineid, state in self.newstates[parent]:
241
byline.setdefault(lineid, []).append((parent, state))
243
for (lineid, line) in self.weave:
246
for (parent, state) in byline.get(lineid, []):
250
elif state == maxstate:
253
result.append((line, (maxstate & 1) == 1, perps))
256
def merge_revisions(self, reva, revb):
258
# non-conflict lines are strings, conflict sections are
259
# ([linesa], [linesb])
260
va = self._make_vals(reva)
261
vb = self._make_vals(revb)
263
awins, bwins = False, False
264
alines, blines = [], []
265
for lineid, line in self.weave:
266
aval, bval = va.get(lineid, 0), vb.get(lineid, 0)
267
if aval & 1 and bval & 1:
268
# append a matched line and the section prior to it
271
r.append((alines, blines))
277
awins, bwins = False, False
278
alines, blines = [], []
279
elif aval & 1 or bval & 1:
280
# extend either side of the potential conflict
281
# section with a non-matching line
290
# add the potential conflict section at the end
292
r.append((alines, blines))
300
w.add_revision(1, ['a', 'b'], [])
301
assert w.retrieve_revision(1) == ['a', 'b']
302
w.add_revision(2, ['a', 'x', 'b'], [1])
303
assert w.retrieve_revision(2) == ['a', 'x', 'b']
304
w.add_revision(3, ['a', 'y', 'b'], [1])
305
assert w.retrieve_revision(3) == ['a', 'y', 'b']
306
assert w.merge_revisions(2, 3) == ['a', (['x'], ['y']), 'b']
307
w.add_revision(4, ['a', 'x', 'b'], [1])
308
w.add_revision(5, ['a', 'z', 'b'], [4])
309
assert w.merge_revisions(2, 5) == ['a', 'z', 'b']
311
w.add_revision('p', ['a', 'b'], [])
312
w.add_revision('q', ['a', 'c'], ['p'])
313
w.add_revision('r', ['a'], ['p'])
314
assert w.annotate('r') == [('a', True, ['p']), ('b', False, ['r'])]