~bzr-pqm/bzr/bzr.dev

2781.1.1 by Martin Pool
merge cpatiencediff from Lukas
1
/*
2
 Copyright (C) 2007 Canonical Ltd
3
4
 This program is free software; you can redistribute it and/or modify
5
 it under the terms of the GNU General Public License as published by
6
 the Free Software Foundation; either version 2 of the License, or
7
 (at your option) any later version.
8
9
 This program is distributed in the hope that it will be useful,
10
 but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
 GNU General Public License for more details.
13
14
 You should have received a copy of the GNU General Public License
15
 along with this program; if not, write to the Free Software
16
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
18
 Function equate_lines based on bdiff.c from Mercurial.
19
   Copyright (C) 2005, 2006 Matt Mackall <mpm@selenic.com>
20
21
 Functions unique_lcs/recurse_matches based on _patiencediff_py.py.
22
   Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
23
*/
24
25
26
#include <stdlib.h>
27
#include <string.h>
28
#include <Python.h>
29
30
31
/* http://www.python.org/dev/peps/pep-0353/ */
32
#if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN)
33
typedef int Py_ssize_t;
34
#define PY_SSIZE_T_MAX INT_MAX
35
#define PY_SSIZE_T_MIN INT_MIN
36
#endif
37
38
39
#if defined(__GNUC__)
40
#   define inline __inline__
41
#elif defined(_MSC_VER)
42
#   define inline __inline
43
#else
44
#   define inline
45
#endif
46
47
48
#define MIN(a, b) (((a) > (b)) ? (b) : (a))
49
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
50
51
52
#define SENTINEL -1
53
54
55
enum {
56
    OP_EQUAL = 0,
57
    OP_INSERT,
58
    OP_DELETE,
59
    OP_REPLACE
60
};
61
62
63
/* values from this array need to correspont to the order of the enum above */
64
static char *opcode_names[] = {
65
    "equal",
66
    "insert",
67
    "delete",
68
    "replace",
69
};
70
71
72
struct line {
3074.2.1 by John Arbash Meinel
Change the C PatienceDiff implementation to support arbitrary objects.
73
    long hash;         /* hash code of the string/object */
2781.1.1 by Martin Pool
merge cpatiencediff from Lukas
74
    Py_ssize_t next;   /* next line from the same equivalence class */
75
    Py_ssize_t equiv;  /* equivalence class */
3074.2.8 by John Arbash Meinel
Stop using const, since PyObject_Compare doesn't like it. (Lukáš)
76
    PyObject *data;
2781.1.1 by Martin Pool
merge cpatiencediff from Lukas
77
};
78
79
80
struct bucket {
81
    Py_ssize_t a_head;  /* first item in `a` from this equivalence class */
82
    Py_ssize_t a_count;
83
    Py_ssize_t b_head;  /* first item in `b` from this equivalence class */
84
    Py_ssize_t b_count;
85
    Py_ssize_t a_pos;
86
    Py_ssize_t b_pos;
87
};
88
89
90
struct hashtable {
91
    Py_ssize_t last_a_pos;
92
    Py_ssize_t last_b_pos;
93
    Py_ssize_t size;
94
    struct bucket *table;
95
};
96
97
struct matching_line {
98
    Py_ssize_t a;     /* index of the line in `a` */
99
    Py_ssize_t b;     /* index of the line in `b` */
100
};
101
102
103
struct matching_block {
104
    Py_ssize_t a;     /* index of the first line in `a` */
105
    Py_ssize_t b;     /* index of the first line in `b` */
106
    Py_ssize_t len;   /* length of the block */
107
};
108
109
110
struct matching_blocks {
111
    struct matching_block *matches;
112
    Py_ssize_t count;
113
};
114
115
116
struct opcode {
117
    int tag;
118
    Py_ssize_t i1;
119
    Py_ssize_t i2;
120
    Py_ssize_t j1;
121
    Py_ssize_t j2;
122
};
123
124
125
typedef struct {
126
    PyObject_HEAD
127
    Py_ssize_t asize;
128
    Py_ssize_t bsize;
129
    struct line *a;
130
    struct line *b;
131
    struct hashtable hashtable;
132
    Py_ssize_t *backpointers;
133
} PatienceSequenceMatcher;
134
135
136
static inline Py_ssize_t
137
bisect_left(Py_ssize_t *list, Py_ssize_t item, Py_ssize_t lo, Py_ssize_t hi)
138
{
139
    while (lo < hi) {
140
        Py_ssize_t mid = lo / 2 + hi / 2 + (lo % 2 + hi % 2) / 2;
141
        if (list[mid] < item)
142
            lo = mid + 1;
143
        else
144
            hi = mid;
145
    }
146
    return lo;
147
}
148
149
150
static inline int
151
compare_lines(struct line *a, struct line *b)
152
{
3074.2.9 by John Arbash Meinel
Large simplification by ignoring len() and just sticking with py objects.
153
    return ((a->hash != b->hash)
3074.2.8 by John Arbash Meinel
Stop using const, since PyObject_Compare doesn't like it. (Lukáš)
154
            || PyObject_Compare(a->data, b->data));
2781.1.1 by Martin Pool
merge cpatiencediff from Lukas
155
}
156
157
158
static inline int
159
find_equivalence_class(struct bucket *hashtable, Py_ssize_t hsize,
160
                       struct line *lines, struct line *ref_lines,
161
                       Py_ssize_t i)
162
{
163
    Py_ssize_t j;
164
    for (j = lines[i].hash & hsize; hashtable[j].b_head != SENTINEL; j = (j + 1) & hsize) {
165
        if (!compare_lines(lines + i, ref_lines + hashtable[j].b_head)) {
166
            break;
167
        }
168
    }
169
    return j;
170
}
171
172
173
static int
174
equate_lines(struct hashtable *result,
175
             struct line *lines_a, struct line *lines_b,
176
             Py_ssize_t asize, Py_ssize_t bsize)
177
{
178
    Py_ssize_t i, j, hsize;
179
    struct bucket *hashtable;
180
181
    /* check for overflow, we need the table to be at least bsize+1 */
182
    if (bsize == PY_SSIZE_T_MAX) {
183
        PyErr_SetNone(PyExc_OverflowError);
184
        return 0;
185
    }
186
187
    /* build a hash table of the next highest power of 2 */
188
    hsize = 1;
189
    while (hsize < bsize + 1)
190
        hsize *= 2;
191
192
    hashtable = (struct bucket *)malloc(sizeof(struct bucket) * hsize);
193
    if (hashtable == NULL) {
194
        PyErr_NoMemory();
195
        return 0;
196
    }
197
198
    /* initialise the hashtable */
199
    for (i = 0; i < hsize; i++) {
200
        hashtable[i].a_count = 0;
201
        hashtable[i].b_count = 0;
202
        hashtable[i].a_head = SENTINEL;
203
        hashtable[i].b_head = SENTINEL;
204
    }
205
    hsize--;
206
207
    /* add lines from lines_b to the hash table chains. iterating
208
       backwards so the matching lines are sorted to the linked list
209
       by the line number (because we are adding new lines to the
210
       head of the list) */
211
    for (i = bsize - 1; i >= 0; i--) {
212
        /* find the first hashtable entry, which is either empty or contains
213
           the same line as lines_b[i] */
214
        j = find_equivalence_class(hashtable, hsize, lines_b, lines_b, i);
215
216
        /* set the equivalence class */
217
        lines_b[i].equiv = j;
218
219
        /* add to the head of the equivalence class */
220
        lines_b[i].next = hashtable[j].b_head;
221
        hashtable[j].b_head = i;
222
        hashtable[j].b_count++;
223
    }
224
225
    /* match items from lines_a to their equivalence class in lines_b.
226
       again, iterating backwards for the right order of the linked lists */
227
    for (i = asize - 1; i >= 0; i--) {
228
        /* find the first hash entry, which is either empty or contains
229
           the same line as lines_a[i] */
230
        j = find_equivalence_class(hashtable, hsize, lines_a, lines_b, i);
231
232
        /* set the equivalence class, even if we are not interested in this
233
           line, because the values are not pre-filled */
234
        lines_a[i].equiv = j;
235
236
        /* we are not interested in lines which are not also in lines_b */
237
        if (hashtable[j].b_head == SENTINEL)
238
            continue;
239
240
        /* add to the head of the equivalence class */
241
        lines_a[i].next = hashtable[j].a_head;
242
        hashtable[j].a_head = i;
243
        hashtable[j].a_count++;
244
    }
245
246
    result->last_a_pos = -1;
247
    result->last_b_pos = -1;
248
    result->size = hsize + 1;
249
    result->table = hashtable;
250
251
    return 1;
252
}
253
254
255
256
/* Finds longest common subsequence of unique lines in a[alo:ahi] and
257
   b[blo:bhi].
258
   Parameter backpointers must have allocated memory for at least
259
   4 * (bhi - blo) ints. */
260
Py_ssize_t
261
unique_lcs(struct matching_line *answer,
262
           struct hashtable *hashtable, Py_ssize_t *backpointers,
263
           struct line *lines_a, struct line *lines_b,
264
           Py_ssize_t alo, Py_ssize_t blo, Py_ssize_t ahi, Py_ssize_t bhi)
265
{
266
    Py_ssize_t i, k, equiv, apos, bpos, norm_apos, norm_bpos, bsize, stacksize;
267
    Py_ssize_t *stacks, *lasts, *btoa;
268
    struct bucket *h;
269
270
    k = 0;
271
    stacksize = 0;
272
    bsize = bhi - blo;
273
    h = hashtable->table;
274
275
    /* "unpack" the allocated memory */
276
    stacks = backpointers + bsize;
277
    lasts = stacks + bsize;
278
    btoa = lasts + bsize;
279
280
    /* initialise the backpointers */
281
    for (i = 0; i < bsize; i++)
282
        backpointers[i] = SENTINEL;
283
284
    if (hashtable->last_a_pos == -1 || hashtable->last_a_pos > alo)
285
        for (i = 0; i < hashtable->size; i++)
286
            h[i].a_pos = h[i].a_head;
287
    hashtable->last_a_pos = alo;
288
289
    if (hashtable->last_b_pos == -1 || hashtable->last_b_pos > blo)
290
        for (i = 0; i < hashtable->size; i++)
291
            h[i].b_pos = h[i].b_head;
292
    hashtable->last_b_pos = blo;
293
294
    for (bpos = blo; bpos < bhi; bpos++) {
295
        equiv = lines_b[bpos].equiv;
296
297
        /* no lines in a or b  */
298
        if (h[equiv].a_count == 0 || h[equiv].b_count == 0)
299
            continue;
300
301
        /* find an unique line in lines_a that matches lines_b[bpos]
302
           if we find more than one line within the range alo:ahi,
303
           jump to the next line from lines_b immediately */
304
        apos = SENTINEL;
305
        /* loop through all lines in the linked list */
306
        for (i = h[equiv].a_pos; i != SENTINEL; i = lines_a[i].next) {
307
            /* the index is lower than alo, the the next line */
308
            if (i < alo) {
309
                h[equiv].a_pos = i;
310
                continue;
311
            }
312
            /* the index is higher than ahi, stop searching */
313
            if (i >= ahi)
314
                break;
315
            /* if the line is within our range, check if it's a duplicate */
316
            if (apos != SENTINEL)
317
                goto nextb;
318
            /* save index to the line */
319
            apos = i;
320
        }
321
        /* this line has no equivalent in lines_a[alo:ahi] */
322
        if (apos == SENTINEL)
323
            goto nextb;
324
325
        /* check for duplicates of this line in lines_b[blo:bhi] */
326
        /* loop through all lines in the linked list */
327
        for (i = h[equiv].b_pos; i != SENTINEL; i = lines_b[i].next) {
328
            /* the index is lower than blo, the the next line */
329
            if (i < blo) {
330
                h[equiv].b_pos = i;
331
                continue;
332
            }
333
            /* the index is higher than bhi, stop searching */
334
            if (i >= bhi)
335
                break;
336
            /* if this isn't the line with started with and it's within
337
               our range, it's a duplicate */
338
            if (i != bpos)
339
                goto nextb;
340
        }
341
342
        /* use normalised indexes ([0,ahi-alo) instead of [alo,ahi))
343
           for the patience sorting algorithm */
344
        norm_bpos = bpos - blo;
345
        norm_apos = apos - alo;
346
        btoa[norm_bpos] = norm_apos;
347
348
        /*
349
        Ok, how does this work...
350
351
        We have a list of matching lines from two lists, a and b. These
352
        matches are stored in variable `btoa`. As we are iterating over this
353
        table by bpos, the lines from b already form an increasing sequence.
354
        We need to "sort" also the lines from a using the patience sorting
355
        algorithm, ignoring the lines which would need to be swapped.
356
357
          http://en.wikipedia.org/wiki/Patience_sorting
358
359
        For each pair of lines, we need to place the line from a on either
360
        an existing pile that has higher value on the top or create a new
361
        pile. Variable `stacks` represents the tops of these piles and in
362
        variable `lasts` we store the lines from b, that correspond to the
363
        lines from a in `stacks`.
364
365
        Whenever we place a new line on top of a pile, we store a
366
        backpointer to the line (b) from top of the previous pile. This means
367
        that after the loop, variable `backpointers` will contain an index
368
        to the previous matching lines that forms an increasing sequence
369
        (over both indexes a and b) with the current matching lines. If
370
        either index a or b of the previous matching lines would be higher
371
        than indexes of the current one or if the indexes of the current
372
        one are 0, it will contain SENTINEL.
373
374
        To construct the LCS, we will just need to follow these backpointers
375
        from the top of the last pile and stop when we reach SENTINEL.
376
        */
377
378
        /* as an optimization, check if the next line comes at the end,
379
           because it usually does */
380
        if (stacksize && stacks[stacksize - 1] < norm_apos)
381
            k = stacksize;
382
        /* as an optimization, check if the next line comes right after
383
           the previous line, because usually it does */
384
        else if (stacksize && (stacks[k] < norm_apos) &&
385
                 (k == stacksize - 1 || stacks[k + 1] > norm_apos))
386
            k += 1;
387
        else
388
            k = bisect_left(stacks, norm_apos, 0, stacksize);
389
390
        if (k > 0)
391
            backpointers[norm_bpos] = lasts[k - 1];
392
393
        if (k < stacksize) {
394
            stacks[k] = norm_apos;
395
            lasts[k] = norm_bpos;
396
        }
397
        else {
398
            stacks[stacksize] = norm_apos;
399
            lasts[stacksize] = norm_bpos;
400
            stacksize += 1;
401
        }
402
403
404
nextb:
405
        ;
406
    }
407
408
    if (stacksize == 0)
409
        return 0;
410
411
    /* backtrace the structures to find the LCS */
412
    i = 0;
413
    k = lasts[stacksize - 1];
414
    while (k != SENTINEL) {
415
        answer[i].a = btoa[k];
416
        answer[i].b = k;
417
        k = backpointers[k];
418
        i++;
419
    }
420
421
    return i;
422
}
423
424
/* Adds a new line to the list of matching blocks, either extending the
425
   current block or adding a new one. */
426
static inline void
427
add_matching_line(struct matching_blocks *answer, Py_ssize_t a, Py_ssize_t b)
428
{
429
    Py_ssize_t last_index = answer->count - 1;
430
    if ((last_index >= 0) &&
431
        (a == answer->matches[last_index].a +
432
              answer->matches[last_index].len) &&
433
        (b == answer->matches[last_index].b +
434
              answer->matches[last_index].len)) {
435
        /* enlarge the last block */
436
        answer->matches[last_index].len++;
437
    }
438
    else {
439
        /* create a new block */
440
        last_index++;
441
        answer->matches[last_index].a = a;
442
        answer->matches[last_index].b = b;
443
        answer->matches[last_index].len = 1;
444
        answer->count++;
445
    }
446
}
447
448
449
static int
450
recurse_matches(struct matching_blocks *answer, struct hashtable *hashtable,
451
                Py_ssize_t *backpointers, struct line *a, struct line *b,
452
                Py_ssize_t alo, Py_ssize_t blo, Py_ssize_t ahi, Py_ssize_t bhi,
453
                int maxrecursion)
454
{
455
    int res;
456
    Py_ssize_t new, last_a_pos, last_b_pos, lcs_size, nahi, nbhi, i, apos, bpos;
457
    struct matching_line *lcs;
458
459
    if (maxrecursion < 0)
460
        return 1;
461
462
    if (alo == ahi || blo == bhi)
463
        return 1;
464
465
    new = 0;
466
    last_a_pos = alo - 1;
467
    last_b_pos = blo - 1;
468
469
    lcs = (struct matching_line *)malloc(sizeof(struct matching_line) * (bhi - blo));
470
    if (lcs == NULL)
471
        return 0;
472
473
    lcs_size = unique_lcs(lcs, hashtable, backpointers, a, b, alo, blo, ahi, bhi);
474
475
    /* recurse between lines which are unique in each file and match */
476
    for (i = lcs_size - 1; i >= 0; i--) {
477
        apos = alo + lcs[i].a;
478
        bpos = blo + lcs[i].b;
479
        if (last_a_pos + 1 != apos || last_b_pos + 1 != bpos) {
480
            res = recurse_matches(answer, hashtable,
481
                                  backpointers, a, b,
482
                                  last_a_pos + 1, last_b_pos + 1,
483
                                  apos, bpos, maxrecursion - 1);
484
            if (!res)
485
                goto error;
486
        }
487
        last_a_pos = apos;
488
        last_b_pos = bpos;
489
        add_matching_line(answer, apos, bpos);
490
        new = 1;
491
    }
492
493
    free(lcs);
494
    lcs = NULL;
495
496
    /* find matches between the last match and the end */
497
    if (new > 0) {
498
        res = recurse_matches(answer, hashtable,
499
                              backpointers, a, b,
500
                              last_a_pos + 1, last_b_pos + 1,
501
                              ahi, bhi, maxrecursion - 1);
502
        if (!res)
503
            goto error;
504
    }
505
506
507
    /* find matching lines at the very beginning */
508
    else if (a[alo].equiv == b[blo].equiv) {
509
        while (alo < ahi && blo < bhi && a[alo].equiv == b[blo].equiv)
510
            add_matching_line(answer, alo++, blo++);
511
        res = recurse_matches(answer, hashtable,
512
                              backpointers, a, b,
513
                              alo, blo, ahi, bhi, maxrecursion - 1);
514
        if (!res)
515
            goto error;
516
    }
517
518
    /* find matching lines at the very end */
519
    else if (a[ahi - 1].equiv == b[bhi - 1].equiv) {
520
        nahi = ahi - 1;
521
        nbhi = bhi - 1;
522
        while (nahi > alo && nbhi > blo && a[nahi - 1].equiv == b[nbhi - 1].equiv) {
523
            nahi--;
524
            nbhi--;
525
        }
526
        res = recurse_matches(answer, hashtable,
527
                              backpointers, a, b,
528
                              last_a_pos + 1, last_b_pos + 1,
529
                              nahi, nbhi, maxrecursion - 1);
530
        if (!res)
531
            goto error;
532
        for (i = 0; i < ahi - nahi; i++)
533
            add_matching_line(answer, nahi + i, nbhi + i);
534
    }
535
536
    return 1;
537
538
error:
539
    free(lcs);
540
    return 0;
541
}
542
543
544
static Py_ssize_t
545
load_lines(PyObject *orig, struct line **lines)
546
{
3074.2.9 by John Arbash Meinel
Large simplification by ignoring len() and just sticking with py objects.
547
    Py_ssize_t size, i;
2781.1.1 by Martin Pool
merge cpatiencediff from Lukas
548
    struct line *line;
3074.2.9 by John Arbash Meinel
Large simplification by ignoring len() and just sticking with py objects.
549
    PyObject *seq, *item;
2781.1.1 by Martin Pool
merge cpatiencediff from Lukas
550
551
    seq = PySequence_Fast(orig, "sequence expected");
552
    if (seq == NULL) {
553
        return -1;
554
    }
555
556
    size = PySequence_Fast_GET_SIZE(seq);
557
    if (size == 0) {
558
        Py_DECREF(seq);
559
        return 0;
560
    }
561
562
    line = *lines = (struct line *)malloc(sizeof(struct line) * size);
563
    if (line == NULL) {
564
        PyErr_NoMemory();
565
        Py_DECREF(seq);
566
        return -1;
567
    }
568
569
    for (i = 0; i < size; i++) {
570
        item = PySequence_Fast_GET_ITEM(seq, i);
3074.2.5 by John Arbash Meinel
using PyObject_Hash, but memcmp if both sides are strings
571
        line->data = item;
3074.2.9 by John Arbash Meinel
Large simplification by ignoring len() and just sticking with py objects.
572
        line->hash = PyObject_Hash(item);
3074.2.3 by John Arbash Meinel
Enable some error checking, and small amount of code cleanup.
573
        if (line->hash == (-1)) {
574
            /* Propogate the hash exception */
3074.2.4 by John Arbash Meinel
Don't leak references to the sequence object.
575
            size = -1;
576
            goto cleanup;
3074.2.3 by John Arbash Meinel
Enable some error checking, and small amount of code cleanup.
577
        }
2781.1.1 by Martin Pool
merge cpatiencediff from Lukas
578
        line->next = SENTINEL;
579
        line++;
580
    }
581
3074.2.4 by John Arbash Meinel
Don't leak references to the sequence object.
582
    cleanup:
2781.1.1 by Martin Pool
merge cpatiencediff from Lukas
583
    Py_DECREF(seq);
584
    return size;
585
}
586
587
588
static PyObject *
589
py_unique_lcs(PyObject *self, PyObject *args)
590
{
591
    PyObject *aseq, *bseq, *res, *item;
592
    Py_ssize_t asize, bsize, i, nmatches, *backpointers = NULL;
593
    struct line *a = NULL, *b = NULL;
594
    struct matching_line *matches = NULL;
595
    struct hashtable hashtable;
596
597
    if (!PyArg_ParseTuple(args, "OO", &aseq, &bseq))
598
        return NULL;
599
600
    hashtable.table = NULL;
601
602
    asize = load_lines(aseq, &a);
603
    bsize = load_lines(bseq, &b);
604
    if (asize == -1 || bsize == -1)
605
        goto error;
606
607
    if (!equate_lines(&hashtable, a, b, asize, bsize))
608
        goto error;
609
610
    matches = (struct matching_line *)malloc(sizeof(struct matching_line) * bsize);
611
    if (matches == NULL)
612
        goto error;
613
614
    backpointers = (Py_ssize_t *)malloc(sizeof(Py_ssize_t) * bsize * 4);
615
    if (backpointers == NULL)
616
        goto error;
617
618
    nmatches = unique_lcs(matches, &hashtable, backpointers, a, b, 0, 0, asize, bsize);
619
620
    res = PyList_New(nmatches);
621
    for (i = 0; i < nmatches; i++) {
622
#if PY_VERSION_HEX < 0x02050000
623
        item = Py_BuildValue("ii", matches[nmatches - i - 1].a, matches[nmatches - i - 1].b);
624
#else
625
        item = Py_BuildValue("nn", matches[nmatches - i - 1].a, matches[nmatches - i - 1].b);
626
#endif
627
        if (item == NULL)
628
            goto error;
629
        if (PyList_SetItem(res, i, item) != 0)
630
            goto error;
631
    }
632
633
    free(backpointers);
634
    free(matches);
635
    free(hashtable.table);
636
    free(b);
637
    free(a);
638
    return res;
639
640
error:
641
    free(backpointers);
642
    free(matches);
643
    free(hashtable.table);
644
    free(b);
645
    free(a);
646
    return NULL;
647
}
648
649
650
static PyObject *
651
py_recurse_matches(PyObject *self, PyObject *args)
652
{
653
    PyObject *aseq, *bseq, *item, *answer;
654
    int maxrecursion, res;
655
    Py_ssize_t i, j, asize, bsize, alo, blo, ahi, bhi;
656
    Py_ssize_t *backpointers = NULL;
657
    struct line *a = NULL, *b = NULL;
658
    struct hashtable hashtable;
659
    struct matching_blocks matches;
660
661
#if PY_VERSION_HEX < 0x02050000
662
    if (!PyArg_ParseTuple(args, "OOiiiiOi", &aseq, &bseq, &alo, &blo,
663
                          &ahi, &bhi, &answer, &maxrecursion))
664
#else
665
    if (!PyArg_ParseTuple(args, "OOnnnnOi", &aseq, &bseq, &alo, &blo,
666
                          &ahi, &bhi, &answer, &maxrecursion))
667
#endif
668
        return NULL;
669
670
    hashtable.table = NULL;
671
    matches.matches = NULL;
672
673
    asize = load_lines(aseq, &a);
674
    bsize = load_lines(bseq, &b);
675
    if (asize == -1 || bsize == -1)
676
        goto error;
677
678
    if (!equate_lines(&hashtable, a, b, asize, bsize))
679
        goto error;
680
681
    matches.count = 0;
682
    matches.matches = (struct matching_block *)malloc(sizeof(struct matching_block) * bsize);
683
    if (matches.matches == NULL)
684
        goto error;
685
686
    backpointers = (Py_ssize_t *)malloc(sizeof(Py_ssize_t) * bsize * 4);
687
    if (backpointers == NULL)
688
        goto error;
689
690
    res = recurse_matches(&matches, &hashtable, backpointers,
691
                          a, b, alo, blo, ahi, bhi, maxrecursion);
692
    if (!res)
693
        goto error;
694
695
    for (i = 0; i < matches.count; i++) {
696
        for (j = 0; j < matches.matches[i].len; j++) {
697
#if PY_VERSION_HEX < 0x02050000
698
            item = Py_BuildValue("ii", matches.matches[i].a + j,
699
                                 matches.matches[i].b + j);
700
#else
701
            item = Py_BuildValue("nn", matches.matches[i].a + j,
702
                                 matches.matches[i].b + j);
703
#endif
704
            if (item == NULL)
705
                goto error;
706
            if (PyList_Append(answer, item) != 0)
707
                goto error;
708
        }
709
    }
710
711
    free(backpointers);
712
    free(matches.matches);
713
    free(hashtable.table);
714
    free(b);
715
    free(a);
716
    Py_RETURN_NONE;
717
718
error:
719
    free(backpointers);
720
    free(matches.matches);
721
    free(hashtable.table);
722
    free(b);
723
    free(a);
724
    return NULL;
725
}
726
727
728
static PyObject *
729
PatienceSequenceMatcher_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
730
{
731
    PyObject *junk, *a, *b;
732
    PatienceSequenceMatcher *self;
733
734
    self = (PatienceSequenceMatcher *)type->tp_alloc(type, 0);
735
    if (self != NULL) {
736
737
        if (!PyArg_ParseTuple(args, "OOO", &junk, &a, &b)) {
738
            Py_DECREF(self);
739
            return NULL;
740
        }
741
742
        self->asize = load_lines(a, &(self->a));
743
        self->bsize = load_lines(b, &(self->b));
744
745
        if (self->asize == -1 || self->bsize == -1) {
746
            Py_DECREF(self);
747
            return NULL;
748
        }
749
750
        if (!equate_lines(&self->hashtable, self->a, self->b, self->asize, self->bsize)) {
751
            Py_DECREF(self);
752
            return NULL;
753
        }
754
755
        self->backpointers = (Py_ssize_t *)malloc(sizeof(Py_ssize_t) * self->bsize * 4);
756
        if (self->backpointers == NULL) {
757
            Py_DECREF(self);
758
            return NULL;
759
        }
760
761
    }
762
763
    return (PyObject *)self;
764
}
765
766
767
static void
768
PatienceSequenceMatcher_dealloc(PatienceSequenceMatcher* self)
769
{
770
    free(self->backpointers);
771
    free(self->hashtable.table);
772
    free(self->b);
773
    free(self->a);
774
    self->ob_type->tp_free((PyObject *)self);
775
}
776
777
778
static char PatienceSequenceMatcher_get_matching_blocks_doc[] =
779
    "Return list of triples describing matching subsequences.\n"
780
    "\n"
781
    "Each triple is of the form (i, j, n), and means that\n"
782
    "a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in\n"
783
    "i and in j.\n"
784
    "\n"
785
    "The last triple is a dummy, (len(a), len(b), 0), and is the only\n"
786
    "triple with n==0.\n"
787
    "\n"
788
    ">>> s = PatienceSequenceMatcher(None, \"abxcd\", \"abcd\")\n"
789
    ">>> s.get_matching_blocks()\n"
790
    "[(0, 0, 2), (3, 2, 2), (5, 4, 0)]\n";
791
792
static PyObject *
793
PatienceSequenceMatcher_get_matching_blocks(PatienceSequenceMatcher* self)
794
{
795
    PyObject *answer, *item;
796
    int res;
797
    Py_ssize_t i;
798
    struct matching_blocks matches;
799
800
    matches.count = 0;
801
    matches.matches = (struct matching_block *)malloc(sizeof(struct matching_block) * self->bsize);
802
    if (matches.matches == NULL)
803
        return PyErr_NoMemory();
804
805
    res = recurse_matches(&matches, &self->hashtable, self->backpointers,
806
                          self->a, self->b, 0, 0,
807
                          self->asize, self->bsize, 10);
808
    if (!res) {
809
        free(matches.matches);
810
        return PyErr_NoMemory();
811
    }
812
813
    answer = PyList_New(matches.count + 1);
814
    if (answer == NULL) {
815
        free(matches.matches);
816
        return NULL;
817
    }
818
819
    for (i = 0; i < matches.count; i++) {
820
#if PY_VERSION_HEX < 0x02050000
821
        item = Py_BuildValue("iii", matches.matches[i].a,
822
                             matches.matches[i].b, matches.matches[i].len);
823
#else
824
        item = Py_BuildValue("nnn", matches.matches[i].a,
825
                             matches.matches[i].b, matches.matches[i].len);
826
#endif
827
        if (item == NULL)
828
            goto error;
829
        if (PyList_SetItem(answer, i, item) != 0)
830
            goto error;
831
    }
832
#if PY_VERSION_HEX < 0x02050000
833
    item = Py_BuildValue("iii", self->asize, self->bsize, 0);
834
#else
835
    item = Py_BuildValue("nnn", self->asize, self->bsize, 0);
836
#endif
837
    if (item == NULL)
838
        goto error;
839
    if (PyList_SetItem(answer, i, item) != 0)
840
        goto error;
841
842
    free(matches.matches);
843
    return answer;
844
845
error:
846
    free(matches.matches);
847
    Py_DECREF(answer);
848
    return NULL;
849
}
850
851
852
static char PatienceSequenceMatcher_get_opcodes_doc[] =
853
    "Return list of 5-tuples describing how to turn a into b.\n"
854
    "\n"
855
    "Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple\n"
856
    "has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the\n"
857
    "tuple preceding it, and likewise for j1 == the previous j2.\n"
858
    "\n"
859
    "The tags are strings, with these meanings:\n"
860
    "\n"
861
    "'replace':  a[i1:i2] should be replaced by b[j1:j2]\n"
862
    "'delete':   a[i1:i2] should be deleted.\n"
863
    "               Note that j1==j2 in this case.\n"
864
    "'insert':   b[j1:j2] should be inserted at a[i1:i1].\n"
865
    "               Note that i1==i2 in this case.\n"
866
    "'equal':    a[i1:i2] == b[j1:j2]\n"
867
    "\n"
868
    ">>> a = \"qabxcd\"\n"
869
    ">>> b = \"abycdf\"\n"
870
    ">>> s = PatienceSequenceMatcher(None, a, b)\n"
871
    ">>> for tag, i1, i2, j1, j2 in s.get_opcodes():\n"
872
    "...    print (\"%7s a[%d:%d] (%s) b[%d:%d] (%s)\" %\n"
873
    "...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))\n"
874
    " delete a[0:1] (q) b[0:0] ()\n"
875
    "  equal a[1:3] (ab) b[0:2] (ab)\n"
876
    "replace a[3:4] (x) b[2:3] (y)\n"
877
    "  equal a[4:6] (cd) b[3:5] (cd)\n"
878
    " insert a[6:6] () b[5:6] (f)\n";
879
880
static PyObject *
881
PatienceSequenceMatcher_get_opcodes(PatienceSequenceMatcher* self)
882
{
883
    PyObject *answer, *item;
884
    Py_ssize_t i, j, k, ai, bj;
885
    int tag, res;
886
    struct matching_blocks matches;
887
888
    matches.count = 0;
889
    matches.matches = (struct matching_block *)malloc(sizeof(struct matching_block) * (self->bsize + 1));
890
    if (matches.matches == NULL)
891
        return PyErr_NoMemory();
892
893
    res = recurse_matches(&matches, &self->hashtable, self->backpointers,
894
                          self->a, self->b, 0, 0,
895
                          self->asize, self->bsize, 10);
896
    if (!res) {
897
        free(matches.matches);
898
        return PyErr_NoMemory();
899
    }
900
901
    matches.matches[matches.count].a = self->asize;
902
    matches.matches[matches.count].b = self->bsize;
903
    matches.matches[matches.count].len = 0;
904
    matches.count++;
905
906
    answer = PyList_New(0);
907
    if (answer == NULL) {
908
        free(matches.matches);
909
        return NULL;
910
    }
911
912
    i = j = 0;
913
    for (k = 0; k < matches.count; k++) {
914
        ai = matches.matches[k].a;
915
        bj = matches.matches[k].b;
916
917
        tag = -1;
918
        if (i < ai && j < bj)
919
            tag = OP_REPLACE;
920
        else if (i < ai)
921
            tag = OP_DELETE;
922
        else if (j < bj)
923
            tag = OP_INSERT;
924
925
        if (tag != -1) {
926
#if PY_VERSION_HEX < 0x02050000
927
            item = Py_BuildValue("siiii", opcode_names[tag], i, ai, j, bj);
928
#else
929
            item = Py_BuildValue("snnnn", opcode_names[tag], i, ai, j, bj);
930
#endif
931
            if (item == NULL)
932
                goto error;
933
            if (PyList_Append(answer, item) != 0)
934
                goto error;
935
        }
936
937
        i = ai + matches.matches[k].len;
938
        j = bj + matches.matches[k].len;
939
940
        if (matches.matches[k].len > 0) {
941
#if PY_VERSION_HEX < 0x02050000
942
            item = Py_BuildValue("siiii", opcode_names[OP_EQUAL], ai, i, bj, j);
943
#else
944
            item = Py_BuildValue("snnnn", opcode_names[OP_EQUAL], ai, i, bj, j);
945
#endif
946
            if (item == NULL)
947
                goto error;
948
            if (PyList_Append(answer, item) != 0)
949
                goto error;
950
        }
951
    }
952
953
    free(matches.matches);
954
    return answer;
955
956
error:
957
    free(matches.matches);
958
    Py_DECREF(answer);
959
    return NULL;
960
}
961
962
963
static char PatienceSequenceMatcher_get_grouped_opcodes_doc[] =
964
    "Isolate change clusters by eliminating ranges with no changes.\n"
965
    "\n"
966
    "Return a list of groups with upto n lines of context.\n"
967
    "Each group is in the same format as returned by get_opcodes().\n"
968
    "\n"
969
    ">>> from pprint import pprint\n"
970
    ">>> a = map(str, range(1,40))\n"
971
    ">>> b = a[:]\n"
972
    ">>> b[8:8] = ['i']     # Make an insertion\n"
973
    ">>> b[20] += 'x'       # Make a replacement\n"
974
    ">>> b[23:28] = []      # Make a deletion\n"
975
    ">>> b[30] += 'y'       # Make another replacement\n"
976
    ">>> pprint(PatienceSequenceMatcher(None,a,b).get_grouped_opcodes())\n"
977
    "[[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],\n"
978
    " [('equal', 16, 19, 17, 20),\n"
979
    "  ('replace', 19, 20, 20, 21),\n"
980
    "  ('equal', 20, 22, 21, 23),\n"
981
    "  ('delete', 22, 27, 23, 23),\n"
982
    "  ('equal', 27, 30, 23, 26)],\n"
983
    " [('equal', 31, 34, 27, 30),\n"
984
    "  ('replace', 34, 35, 30, 31),\n"
985
    "  ('equal', 35, 38, 31, 34)]]\n";
986
987
static PyObject *
988
PatienceSequenceMatcher_get_grouped_opcodes(PatienceSequenceMatcher* self,
989
                                            PyObject *args)
990
{
991
    PyObject *answer, *group, *item;
992
    Py_ssize_t i, j, k, ai, bj, size, ncodes, tag;
993
    Py_ssize_t i1, i2, j1, j2;
994
    int n = 3, nn, res;
995
    struct matching_blocks matches;
996
    struct opcode *codes;
997
998
    if (!PyArg_ParseTuple(args, "|i", &n))
999
        return NULL;
1000
1001
    matches.count = 0;
1002
    matches.matches = (struct matching_block *)malloc(sizeof(struct matching_block) * (self->bsize + 1));
1003
    if (matches.matches == NULL)
1004
        return PyErr_NoMemory();
1005
1006
    res = recurse_matches(&matches, &self->hashtable, self->backpointers,
1007
                          self->a, self->b, 0, 0,
1008
                          self->asize, self->bsize, 10);
1009
    if (!res) {
1010
        free(matches.matches);
1011
        return PyErr_NoMemory();
1012
    }
1013
1014
    matches.matches[matches.count].a = self->asize;
1015
    matches.matches[matches.count].b = self->bsize;
1016
    matches.matches[matches.count].len = 0;
1017
    matches.count++;
1018
1019
    ncodes = 0;
1020
    codes = (struct opcode *)malloc(sizeof(struct opcode) * matches.count * 2);
1021
    if (codes == NULL) {
1022
        free(matches.matches);
1023
        return PyErr_NoMemory();
1024
    }
1025
1026
    i = j = 0;
1027
    for (k = 0; k < matches.count; k++) {
1028
        ai = matches.matches[k].a;
1029
        bj = matches.matches[k].b;
1030
1031
        tag = -1;
1032
        if (i < ai && j < bj)
1033
            tag = OP_REPLACE;
1034
        else if (i < ai)
1035
            tag = OP_DELETE;
1036
        else if (j < bj)
1037
            tag = OP_INSERT;
1038
1039
        if (tag != -1) {
1040
            codes[ncodes].tag = tag;
1041
            codes[ncodes].i1 = i;
1042
            codes[ncodes].i2 = ai;
1043
            codes[ncodes].j1 = j;
1044
            codes[ncodes].j2 = bj;
1045
            ncodes++;
1046
        }
1047
1048
        i = ai + matches.matches[k].len;
1049
        j = bj + matches.matches[k].len;
1050
1051
        if (matches.matches[k].len > 0) {
1052
            codes[ncodes].tag = OP_EQUAL;
1053
            codes[ncodes].i1 = ai;
1054
            codes[ncodes].i2 = i;
1055
            codes[ncodes].j1 = bj;
1056
            codes[ncodes].j2 = j;
1057
            ncodes++;
1058
        }
1059
    }
1060
1061
    if (ncodes == 0) {
1062
        codes[ncodes].tag = OP_EQUAL;
1063
        codes[ncodes].i1 = 0;
1064
        codes[ncodes].i2 = 1;
1065
        codes[ncodes].j1 = 0;
1066
        codes[ncodes].j2 = 1;
1067
        ncodes++;
1068
    }
1069
1070
    /* fixup leading and trailing groups if they show no changes. */
1071
    if (codes[0].tag == OP_EQUAL) {
1072
        codes[0].i1 = MAX(codes[0].i1, codes[0].i2 - n);
1073
        codes[0].j1 = MAX(codes[0].j1, codes[0].j2 - n);
1074
    }
1075
    if (codes[ncodes - 1].tag == OP_EQUAL) {
1076
        codes[ncodes - 1].i2 = MIN(codes[ncodes - 1].i2,
1077
                                   codes[ncodes - 1].i1 + n);
1078
        codes[ncodes - 1].j2 = MIN(codes[ncodes - 1].j2,
1079
                                   codes[ncodes - 1].j1 + n);
1080
    }
1081
1082
    group = NULL;
1083
1084
    answer = PyList_New(0);
1085
    if (answer == NULL)
1086
        goto error;
1087
1088
    group = PyList_New(0);
1089
    if (group == NULL)
1090
        goto error;
1091
1092
    nn = n + n;
1093
    tag = -1;
1094
    for (i = 0; i < ncodes; i++) {
1095
        tag = codes[i].tag;
1096
        i1 = codes[i].i1;
1097
        i2 = codes[i].i2;
1098
        j1 = codes[i].j1;
1099
        j2 = codes[i].j2;
1100
        /* end the current group and start a new one whenever
1101
           there is a large range with no changes. */
1102
        if (tag == OP_EQUAL && i2 - i1 > nn) {
1103
#if PY_VERSION_HEX < 0x02050000
1104
            item = Py_BuildValue("siiii", opcode_names[tag],
1105
                                  i1, MIN(i2, i1 + n), j1, MIN(j2, j1 + n));
1106
#else
1107
            item = Py_BuildValue("snnnn", opcode_names[tag],
1108
                                  i1, MIN(i2, i1 + n), j1, MIN(j2, j1 + n));
1109
#endif
1110
            if (item == NULL)
1111
                goto error;
1112
            if (PyList_Append(group, item) != 0)
1113
                goto error;
1114
            if (PyList_Append(answer, group) != 0)
1115
                goto error;
1116
            group = PyList_New(0);
1117
            if (group == NULL)
1118
                goto error;
1119
            i1 = MAX(i1, i2 - n);
1120
            j1 = MAX(j1, j2 - n);
1121
        }
1122
#if PY_VERSION_HEX < 0x02050000
1123
        item = Py_BuildValue("siiii", opcode_names[tag], i1, i2, j1 ,j2);
1124
#else
1125
        item = Py_BuildValue("snnnn", opcode_names[tag], i1, i2, j1 ,j2);
1126
#endif
1127
        if (item == NULL)
1128
            goto error;
1129
        if (PyList_Append(group, item) != 0)
1130
            goto error;
1131
    }
1132
    size = PyList_Size(group);
1133
    if (size > 0 && !(size == 1 && tag == OP_EQUAL)) {
1134
        if (PyList_Append(answer, group) != 0)
1135
            goto error;
1136
    }
1137
    else
1138
        Py_DECREF(group);
1139
1140
    free(codes);
1141
    free(matches.matches);
1142
    return answer;
1143
1144
error:
1145
    free(codes);
1146
    free(matches.matches);
1147
    Py_DECREF(group);
1148
    Py_DECREF(answer);
1149
    return NULL;
1150
}
1151
1152
1153
static PyMethodDef PatienceSequenceMatcher_methods[] = {
1154
    {"get_matching_blocks",
1155
     (PyCFunction)PatienceSequenceMatcher_get_matching_blocks,
1156
     METH_NOARGS,
1157
     PatienceSequenceMatcher_get_matching_blocks_doc},
1158
    {"get_opcodes",
1159
     (PyCFunction)PatienceSequenceMatcher_get_opcodes,
1160
     METH_NOARGS,
1161
     PatienceSequenceMatcher_get_opcodes_doc},
1162
    {"get_grouped_opcodes",
1163
     (PyCFunction)PatienceSequenceMatcher_get_grouped_opcodes,
1164
     METH_VARARGS,
1165
     PatienceSequenceMatcher_get_grouped_opcodes_doc},
1166
    {NULL}
1167
};
1168
1169
1170
static char PatienceSequenceMatcher_doc[] =
1171
    "C implementation of PatienceSequenceMatcher";
1172
1173
1174
static PyTypeObject PatienceSequenceMatcherType = {
1175
    PyObject_HEAD_INIT(NULL)
1176
    0,                                           /* ob_size */
1177
    "PatienceSequenceMatcher",                   /* tp_name */
1178
    sizeof(PatienceSequenceMatcher),             /* tp_basicsize */
1179
    0,                                           /* tp_itemsize */
1180
    (destructor)PatienceSequenceMatcher_dealloc, /* tp_dealloc */
1181
    0,                                           /* tp_print */
1182
    0,                                           /* tp_getattr */
1183
    0,                                           /* tp_setattr */
1184
    0,                                           /* tp_compare */
1185
    0,                                           /* tp_repr */
1186
    0,                                           /* tp_as_number */
1187
    0,                                           /* tp_as_sequence */
1188
    0,                                           /* tp_as_mapping */
1189
    0,                                           /* tp_hash */
1190
    0,                                           /* tp_call */
1191
    0,                                           /* tp_str */
1192
    0,                                           /* tp_getattro */
1193
    0,                                           /* tp_setattro */
1194
    0,                                           /* tp_as_buffer */
1195
    Py_TPFLAGS_DEFAULT,                          /* tp_flags*/
1196
    PatienceSequenceMatcher_doc,                 /* tp_doc */
1197
    0,                                           /* tp_traverse */
1198
    0,                                           /* tp_clear */
1199
    0,                                           /* tp_richcompare */
1200
    0,                                           /* tp_weaklistoffset */
1201
    0,                                           /* tp_iter */
1202
    0,                                           /* tp_iternext */
1203
    PatienceSequenceMatcher_methods,             /* tp_methods */
1204
    0,                                           /* tp_members */
1205
    0,                                           /* tp_getset */
1206
    0,                                           /* tp_base */
1207
    0,                                           /* tp_dict */
1208
    0,                                           /* tp_descr_get */
1209
    0,                                           /* tp_descr_set */
1210
    0,                                           /* tp_dictoffset */
1211
    0,                                           /* tp_init */
1212
    0,                                           /* tp_alloc */
1213
    PatienceSequenceMatcher_new,                 /* tp_new */
1214
};
1215
1216
1217
static PyMethodDef cpatiencediff_methods[] = {
1218
    {"unique_lcs_c", py_unique_lcs, METH_VARARGS},
1219
    {"recurse_matches_c", py_recurse_matches, METH_VARARGS},
1220
    {NULL, NULL}
1221
};
1222
1223
1224
PyMODINIT_FUNC
1225
init_patiencediff_c(void)
1226
{
1227
    PyObject* m;
1228
1229
    if (PyType_Ready(&PatienceSequenceMatcherType) < 0)
1230
        return;
1231
1232
    m = Py_InitModule3("_patiencediff_c", cpatiencediff_methods,
1233
                       "C implementation of PatienceSequenceMatcher");
1234
    if (m == NULL)
1235
      return;
1236
1237
    Py_INCREF(&PatienceSequenceMatcherType);
1238
    PyModule_AddObject(m, "PatienceSequenceMatcher_c",
1239
                       (PyObject *)&PatienceSequenceMatcherType);
1240
}