~bzr-pqm/bzr/bzr.dev

0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
1
/*
2
 * diff-delta.c: generate a delta between two buffers
3
 *
4
 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5
 * http://www.xmailserver.org/xdiff-lib.html
6
 *
7
 * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
0.17.31 by John Arbash Meinel
Bring in the 'rabin' experiment.
8
 * Adapted for Bazaar by John Arbash Meinel <john@arbash-meinel.com> (C) 2009
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
9
 *
10
 * This code is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License version 2 as
12
 * published by the Free Software Foundation.
13
 */
14
15
#include "delta.h"
0.17.31 by John Arbash Meinel
Bring in the 'rabin' experiment.
16
#include <stdlib.h>
17
#include <string.h>
0.23.5 by John Arbash Meinel
Minor changes to get diff-delta.c and patch-delta.c to compile.
18
#include <assert.h>
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
19
20
/* maximum hash entry list for the same hash bucket */
21
#define HASH_LIMIT 64
22
23
#define RABIN_SHIFT 23
24
#define RABIN_WINDOW 16
25
26
static const unsigned int T[256] = {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
27
    0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
28
    0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
29
    0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
30
    0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
31
    0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
32
    0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
33
    0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
34
    0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
35
    0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
36
    0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
37
    0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
38
    0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
39
    0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
40
    0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
41
    0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
42
    0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
43
    0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
44
    0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
45
    0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
46
    0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
47
    0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
48
    0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
49
    0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
50
    0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
51
    0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
52
    0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
53
    0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
54
    0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
55
    0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
56
    0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
57
    0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
58
    0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
59
    0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
60
    0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
61
    0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
62
    0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
63
    0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
64
    0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
65
    0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
66
    0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
67
    0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
68
    0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
69
    0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
70
};
71
72
static const unsigned int U[256] = {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
73
    0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
74
    0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
75
    0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
76
    0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
77
    0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
78
    0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
79
    0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
80
    0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
81
    0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
82
    0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
83
    0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
84
    0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
85
    0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
86
    0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
87
    0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
88
    0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
89
    0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
90
    0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
91
    0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
92
    0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
93
    0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
94
    0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
95
    0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
96
    0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
97
    0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
98
    0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
99
    0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
100
    0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
101
    0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
102
    0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
103
    0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
104
    0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
105
    0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
106
    0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
107
    0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
108
    0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
109
    0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
110
    0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
111
    0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
112
    0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
113
    0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
114
    0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
115
    0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
116
};
117
118
struct index_entry {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
119
    const unsigned char *ptr;
120
    const struct source_info *src;
121
    unsigned int val;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
122
};
123
124
struct unpacked_index_entry {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
125
    struct index_entry entry;
126
    struct unpacked_index_entry *next;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
127
};
128
129
struct delta_index {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
130
    unsigned long memsize; /* Total bytes pointed to by this index */
131
    const struct source_info *last_src; /* Information about the referenced source */
132
    unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
133
                               entry */
134
    unsigned int num_entries; /* The total number of entries in this index */
135
    struct index_entry *last_entry; /* Pointer to the last valid entry */
136
    struct index_entry *hash[];
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
137
};
138
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
139
static unsigned int
140
limit_hash_buckets(struct unpacked_index_entry **hash,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
141
                   unsigned int *hash_count, unsigned int hsize,
142
                   unsigned int entries)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
143
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
144
    struct unpacked_index_entry *entry;
145
    unsigned int i;
146
    /*
147
     * Determine a limit on the number of entries in the same hash
148
     * bucket.  This guards us against pathological data sets causing
149
     * really bad hash distribution with most entries in the same hash
150
     * bucket that would bring us to O(m*n) computing costs (m and n
151
     * corresponding to reference and target buffer sizes).
152
     *
153
     * Make sure none of the hash buckets has more entries than
154
     * we're willing to test.  Otherwise we cull the entry list
155
     * uniformly to still preserve a good repartition across
156
     * the reference buffer.
157
     */
158
    for (i = 0; i < hsize; i++) {
159
        int acc;
160
161
        if (hash_count[i] <= HASH_LIMIT)
162
            continue;
163
164
        /* We leave exactly HASH_LIMIT entries in the bucket */
165
        entries -= hash_count[i] - HASH_LIMIT;
166
167
        entry = hash[i];
168
        acc = 0;
169
170
        /*
171
         * Assume that this loop is gone through exactly
172
         * HASH_LIMIT times and is entered and left with
173
         * acc==0.  So the first statement in the loop
174
         * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
175
         * to the accumulator, and the inner loop consequently
176
         * is run (hash_count[i]-HASH_LIMIT) times, removing
177
         * one element from the list each time.  Since acc
178
         * balances out to 0 at the final run, the inner loop
179
         * body can't be left with entry==NULL.  So we indeed
180
         * encounter entry==NULL in the outer loop only.
181
         */
182
        do {
183
            acc += hash_count[i] - HASH_LIMIT;
184
            if (acc > 0) {
185
                struct unpacked_index_entry *keep = entry;
186
                do {
187
                    entry = entry->next;
188
                    acc -= HASH_LIMIT;
189
                } while (acc > 0);
190
                keep->next = entry->next;
191
            }
192
            entry = entry->next;
193
        } while (entry);
194
    }
195
    return entries;
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
196
}
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
197
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
198
static struct delta_index *
199
pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
200
                 unsigned int num_entries)
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
201
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
202
    unsigned int i, memsize;
203
    struct unpacked_index_entry *entry;
204
    struct delta_index *index;
205
    struct index_entry *packed_entry, **packed_hash;
206
    void *mem;
207
    /*
208
     * Now create the packed index in array form
209
     * rather than linked lists.
210
     */
211
    memsize = sizeof(*index)
212
        + sizeof(*packed_hash) * (hsize+1)
213
        + sizeof(*packed_entry) * num_entries;
214
    mem = malloc(memsize);
215
    if (!mem) {
216
        return NULL;
217
    }
218
219
    index = mem;
220
    index->memsize = memsize;
221
    index->hash_mask = hsize - 1;
222
    index->num_entries = num_entries;
223
224
    mem = index->hash;
225
    packed_hash = mem;
226
    mem = packed_hash + (hsize+1);
227
    packed_entry = mem;
228
229
    for (i = 0; i < hsize; i++) {
230
        /*
231
         * Coalesce all entries belonging to one linked list
232
         * into consecutive array entries.
233
         */
234
        packed_hash[i] = packed_entry;
235
        for (entry = hash[i]; entry; entry = entry->next)
236
            *packed_entry++ = entry->entry;
237
    }
238
239
    /* Sentinel value to indicate the length of the last hash bucket */
240
    packed_hash[hsize] = packed_entry;
241
242
    assert(packed_entry - (struct index_entry *)mem == num_entries);
243
    index->last_entry = (packed_entry - 1);
244
    return index;
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
245
}
246
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
247
void include_entries_from_index(struct unpacked_index_entry **hash,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
248
                                unsigned int *hash_count,
249
                                unsigned int hsize,
250
                                struct unpacked_index_entry *entry,
251
                                const struct delta_index *old)
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
252
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
253
    unsigned int i, hmask, masked_val;
254
    struct index_entry *old_entry;
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
255
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
256
    hmask = hsize - 1;
257
    /* Iterate backwards through the existing entries
258
     * This is so that matches early in the file end up being the first pointer
259
     * in the linked list.
260
     * TODO: We know that all old entries are going to occur before the new
261
     *       entries, and that we are going to follow this with a limit and
262
     *       then pack step. There is probably a way we could optimize this
263
     *       step, so that we wouldn't have to copy everything into a new
264
     *       buffer, and then copy everything again into yet another buffer.
265
     */
266
    for (old_entry = old->last_entry, i = 0; i < old->num_entries;
267
         i++, old_entry--) {
268
        masked_val = old_entry->val & hmask;
269
        entry->entry = *old_entry;
270
        entry->next = hash[masked_val];
271
        hash[masked_val] = entry++;
272
        hash_count[masked_val]++;
273
    }
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
274
}
275
0.23.49 by John Arbash Meinel
When adding new entries to the delta index, use memcpy
276
struct delta_index *
277
create_index_from_old_and_hash(const struct delta_index *old,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
278
                               struct unpacked_index_entry **hash,
279
                               unsigned int hsize,
280
                               unsigned int num_entries)
0.23.49 by John Arbash Meinel
When adding new entries to the delta index, use memcpy
281
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
282
    unsigned int i, memsize, total_num_entries;
283
    struct unpacked_index_entry *entry;
284
    struct delta_index *index;
285
    struct index_entry *packed_entry, **packed_hash;
286
    struct index_entry *copy_from_start, *copy_to_start;
287
    size_t total_copy, to_copy;
288
    unsigned int num_ops;
289
    unsigned int bytes_copied;
290
    void *mem;
291
    /*
292
     * Now create the packed index in array form
293
     * rather than linked lists.
294
     */
295
    total_num_entries = num_entries + old->num_entries;
296
    memsize = sizeof(*index)
297
        + sizeof(*packed_hash) * (hsize+1)
298
        + sizeof(*packed_entry) * total_num_entries;
299
    mem = malloc(memsize);
300
    if (!mem) {
301
        return NULL;
302
    }
303
304
    index = mem;
305
    index->memsize = memsize;
306
    index->hash_mask = hsize - 1;
307
    index->num_entries = total_num_entries;
308
309
    mem = index->hash;
310
    packed_hash = mem;
311
    mem = packed_hash + (hsize+1);
312
    packed_entry = mem;
313
314
    total_copy = 0;
315
    bytes_copied = 0;
316
    num_ops = 0;
317
    copy_from_start = copy_to_start = NULL;
318
    for (i = 0; i < hsize; i++) {
319
        /*
320
         * Coalesce all entries belonging to one linked list
321
         * into consecutive array entries.
322
         * The entries in old all come before the entries in hash.
323
         */
324
        packed_hash[i] = packed_entry;
325
        to_copy = (old->hash[i+1] - old->hash[i]);
326
        if (to_copy > 0) {
327
            /* This next range can be copied wholesale. However, we will wait
328
             * until we have to insert something from the new data before we
329
             * copy everything.
330
             * So for now, just reserve space for the copy.
331
             */
332
            if (total_copy == 0) {
333
                copy_from_start = old->hash[i];
334
                copy_to_start = packed_entry;
335
            }
336
            packed_entry += to_copy;
337
            total_copy += to_copy;
338
            assert((packed_entry - copy_to_start) == total_copy);
339
            assert((old->hash[i+1] - copy_from_start) == total_copy);
340
        }
341
        for (entry = hash[i]; entry; entry = entry->next) {
342
            /* We have an entry to insert, so flush the copy buffer */
343
            if (total_copy > 0) {
344
                assert(copy_to_start != NULL);
345
                assert(copy_from_start != NULL);
346
                memcpy(copy_to_start, copy_from_start,
347
                       total_copy * sizeof(struct index_entry));
348
                bytes_copied += (total_copy * sizeof(struct index_entry));
349
                total_copy = 0;
350
                copy_from_start = copy_to_start = NULL;
351
                num_ops += 1;
352
            }
353
            *packed_entry++ = entry->entry;
354
        }
355
    }
356
    if (total_copy > 0) {
357
        assert(copy_to_start != NULL);
358
        assert(copy_from_start != NULL);
359
        memcpy(copy_to_start, copy_from_start,
360
               total_copy * sizeof(struct index_entry));
361
        bytes_copied += (total_copy * sizeof(struct index_entry));
362
        num_ops += 1;
363
    }
364
365
    /* Sentinel value to indicate the length of the last hash bucket */
366
    packed_hash[hsize] = packed_entry;
367
368
    assert(packed_entry - (struct index_entry *)mem == total_num_entries);
369
    index->last_entry = (packed_entry - 1);
370
    return index;
0.23.49 by John Arbash Meinel
When adding new entries to the delta index, use memcpy
371
}
372
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
373
374
struct delta_index * create_delta_index(const struct source_info *src,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
375
                                        const struct delta_index *old)
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
376
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
377
    unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
378
    unsigned int total_num_entries;
379
    const unsigned char *data, *buffer;
380
    struct delta_index *index;
381
    struct unpacked_index_entry *entry, **hash;
382
    void *mem;
383
    unsigned long memsize;
384
385
    if (!src->buf || !src->size)
386
        return NULL;
387
    buffer = src->buf;
388
389
    /* Determine index hash size.  Note that indexing skips the
390
       first byte to allow for optimizing the Rabin's polynomial
391
       initialization in create_delta(). */
392
    num_entries = (src->size - 1)  / RABIN_WINDOW;
393
    if (old != NULL)
394
        total_num_entries = num_entries + old->num_entries;
395
    else
396
        total_num_entries = num_entries;
397
    hsize = total_num_entries / 4;
398
    for (i = 4; (1u << i) < hsize && i < 31; i++);
399
    hsize = 1 << i;
400
    hmask = hsize - 1;
401
402
    /* allocate lookup index */
403
    memsize = sizeof(*hash) * hsize +
404
          sizeof(*entry) * total_num_entries;
405
    mem = malloc(memsize);
406
    if (!mem)
407
        return NULL;
408
    hash = mem;
409
    mem = hash + hsize;
410
    entry = mem;
411
412
    memset(hash, 0, hsize * sizeof(*hash));
413
414
    /* allocate an array to count hash num_entries */
415
    hash_count = calloc(hsize, sizeof(*hash_count));
416
    if (!hash_count) {
417
        free(hash);
418
        return NULL;
419
    }
420
421
    /* then populate the index for the new data */
422
    prev_val = ~0;
423
    for (data = buffer + num_entries * RABIN_WINDOW - RABIN_WINDOW;
424
         data >= buffer;
425
         data -= RABIN_WINDOW) {
426
        unsigned int val = 0;
427
        for (i = 1; i <= RABIN_WINDOW; i++)
428
            val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
429
        if (val == prev_val) {
430
            /* keep the lowest of consecutive identical blocks */
431
            entry[-1].entry.ptr = data + RABIN_WINDOW;
432
            --num_entries;
433
            --total_num_entries;
434
        } else {
435
            prev_val = val;
436
            i = val & hmask;
437
            entry->entry.ptr = data + RABIN_WINDOW;
438
            entry->entry.val = val;
439
            entry->entry.src = src;
440
            entry->next = hash[i];
441
            hash[i] = entry++;
442
            hash_count[i]++;
443
        }
444
    }
445
    /* If we have an existing delta_index, we want to bring its info into the
446
     * new structure. We assume that the existing structure's objects all occur
447
     * before the new source, so they get first precedence in the index.
448
     */
449
    if (old != NULL)
450
        include_entries_from_index(hash, hash_count, hsize, entry, old);
451
452
    total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
453
                                           total_num_entries);
454
    free(hash_count);
455
    index = pack_delta_index(hash, hsize, total_num_entries);
456
    index->last_src = src;
457
    free(hash);
458
    if (!index) {
459
        return NULL;
460
    }
461
    return index;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
462
}
463
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
464
struct delta_index *
465
create_delta_index_from_delta(const struct source_info *src,
466
                              const struct delta_index *old)
0.23.45 by John Arbash Meinel
Add a function that updates the index for delta bytes.
467
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
468
    unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
469
    unsigned int total_num_entries;
470
    const unsigned char *data, *buffer, *top;
471
    unsigned char cmd;
472
    struct delta_index *index;
473
    struct unpacked_index_entry *entry, **hash;
474
    void *mem;
475
    unsigned long memsize;
476
477
    if (!src->buf || !src->size)
478
        return NULL;
479
    buffer = src->buf;
480
    top = buffer + src->size;
481
482
    /* Determine index hash size.  Note that indexing skips the
483
       first byte to allow for optimizing the Rabin's polynomial
484
       initialization in create_delta().
485
       This computes the maximum number of entries that could be held. The
486
       actual number will be recomputed during processing.
487
       */
488
     
489
    num_entries = (src->size - 1)  / RABIN_WINDOW;
490
    if (old != NULL)
491
        total_num_entries = num_entries + old->num_entries;
492
    else
493
        total_num_entries = num_entries;
494
    hsize = total_num_entries / 4;
495
    for (i = 4; (1u << i) < hsize && i < 31; i++);
496
    hsize = 1 << i;
497
    hmask = hsize - 1;
498
499
    /* allocate lookup index */
500
    memsize = sizeof(*hash) * hsize +
501
          sizeof(*entry) * total_num_entries;
502
    mem = malloc(memsize);
503
    if (!mem)
504
        return NULL;
505
    hash = mem;
506
    mem = hash + hsize;
507
    entry = mem;
508
509
    memset(hash, 0, hsize * sizeof(*hash));
510
511
    /* allocate an array to count hash num_entries */
512
    hash_count = calloc(hsize, sizeof(*hash_count));
513
    if (!hash_count) {
514
        free(hash);
515
        return NULL;
516
    }
517
518
    /* then populate the index for the new data */
519
    /* The create_delta_index code starts at the end and works backward,
520
     * because it is easier to update the entry pointers (you always update the
521
     * head, rather than updating the tail).
522
     * However, we can't walk the delta structure that way.
523
     */
524
    prev_val = ~0;
525
    data = buffer;
526
    /* source size */
527
    get_delta_hdr_size(&data, top);
528
    /* target size */
529
    get_delta_hdr_size(&data, top);
530
    num_entries = 0; /* calculate the real number of entries */
531
    while (data < top) {
532
        cmd = *data++;
533
        if (cmd & 0x80) {
534
            /* Copy instruction, skip it */
535
            if (cmd & 0x01) data++;
536
            if (cmd & 0x02) data++;
537
            if (cmd & 0x04) data++;
538
            if (cmd & 0x08) data++;
539
            if (cmd & 0x10) data++;
540
            if (cmd & 0x20) data++;
541
            if (cmd & 0x40) data++;
542
        } else if (cmd) {
543
            /* Insert instruction, we want to index these bytes */
544
            if (data + cmd > top) {
545
                /* Invalid insert, not enough bytes in the delta */
546
                break;
547
            }
548
            for (; cmd > RABIN_WINDOW; cmd -= RABIN_WINDOW,
549
                                       data += RABIN_WINDOW) {
550
                unsigned int val = 0;
551
                for (i = 1; i <= RABIN_WINDOW; i++)
552
                    val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
553
                if (val != prev_val) {
554
                    /* Only keep the first of consecutive data */
555
                    prev_val = val;
556
                    i = val & hmask;
557
                    entry->entry.ptr = data + RABIN_WINDOW;
558
                    entry->entry.val = val;
559
                    entry->entry.src = src;
560
                    entry->next = NULL;
561
                    /* Now we have to insert this entry at the end of the hash
562
                     * map
563
                     */
564
                    if (hash[i] == NULL) {
565
                        hash[i] = entry;
566
                    } else {
567
                        struct unpacked_index_entry *this;
568
                        for (this = hash[i];
569
                            this->next != NULL;
570
                            this = this->next) { /* No action */ }
571
                        this->next = entry;
572
                        this = entry;
573
                    }
574
                    hash_count[i]++;
575
                    entry++;
576
                    num_entries++;
577
                }
578
            }
579
            /* Move the data pointer by whatever remainder is left */
580
            data += cmd;
581
        } else {
582
            /*
583
             * cmd == 0 is reserved for future encoding
584
             * extensions. In the mean time we must fail when
585
             * encountering them (might be data corruption).
586
             */
587
            break;
588
        }
589
    }
590
    if (data != top) {
591
        /* Something was wrong with this delta */
592
        free(hash);
593
        free(hash_count);
594
        return NULL;
595
    }
596
    if (num_entries == 0) {
597
        /** Nothing to index **/
598
        free(hash);
599
        free(hash_count);
600
        return NULL;
601
    }
602
    if (old != NULL) {
603
        if (hmask == old->hash_mask) {
604
            /* The total hash table size didn't change, which means that
605
             * entries will end up in the same pages. We can do bulk-copying to
606
             * get the final output
607
             */
608
            index = create_index_from_old_and_hash(old, hash, hsize,
609
                                                   num_entries);
610
            free(hash_count);
611
            free(hash);
612
            if (!index) {
613
                return NULL;
614
            }
615
            index->last_src = src;
616
            return index;
617
        } else {
618
            total_num_entries = num_entries + old->num_entries;
619
            include_entries_from_index(hash, hash_count, hsize, entry, old);
620
        }
621
    } else {
622
        total_num_entries = num_entries;
623
    }
624
625
    total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
626
                                           total_num_entries);
627
    free(hash_count);
628
    index = pack_delta_index(hash, hsize, total_num_entries);
629
    free(hash);
630
    if (!index) {
631
        return NULL;
632
    }
633
    index->last_src = src;
634
    return index;
0.23.45 by John Arbash Meinel
Add a function that updates the index for delta bytes.
635
}
636
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
637
void free_delta_index(struct delta_index *index)
638
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
639
    free(index);
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
640
}
641
642
unsigned long sizeof_delta_index(struct delta_index *index)
643
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
644
    if (index)
645
        return index->memsize;
646
    else
647
        return 0;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
648
}
649
650
/*
651
 * The maximum size for any opcode sequence, including the initial header
652
 * plus Rabin window plus biggest copy.
653
 */
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
654
#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
655
656
void *
0.23.44 by John Arbash Meinel
Remove the multi-index handling now that we have index combining instead.
657
create_delta(const struct delta_index *index,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
658
             const void *trg_buf, unsigned long trg_size,
659
             unsigned long *delta_size, unsigned long max_size)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
660
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
661
    unsigned int i, outpos, outsize, moff, msize, val;
662
    const struct source_info *msource;
663
    int inscnt;
664
    const unsigned char *ref_data, *ref_top, *data, *top;
665
    unsigned char *out;
666
    unsigned long source_size;
667
668
    if (!trg_buf || !trg_size)
669
        return NULL;
670
    if (index == NULL)
671
        return NULL;
672
673
    outpos = 0;
674
    outsize = 8192;
675
    if (max_size && outsize >= max_size)
676
        outsize = max_size + MAX_OP_SIZE + 1;
677
    out = malloc(outsize);
678
    if (!out)
679
        return NULL;
680
681
    /* store reference buffer size */
682
    source_size = index->last_src->size + index->last_src->agg_offset;
683
    i = source_size;
684
    while (i >= 0x80) {
685
        out[outpos++] = i | 0x80;
686
        i >>= 7;
687
    }
688
    out[outpos++] = i;
689
690
    /* store target buffer size */
691
    i = trg_size;
692
    while (i >= 0x80) {
693
        out[outpos++] = i | 0x80;
694
        i >>= 7;
695
    }
696
    out[outpos++] = i;
697
698
    data = trg_buf;
699
    top = (const unsigned char *) trg_buf + trg_size;
700
701
    /* Start the matching by filling out with a simple 'insert' instruction, of
702
     * the first RABIN_WINDOW bytes of the input.
703
     */
704
    outpos++; /* leave a byte for the insert command */
705
    val = 0;
706
    for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
707
        out[outpos++] = *data;
708
        val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
709
    }
710
    /* we are now setup with an insert of 'i' bytes and val contains the RABIN
711
     * hash for those bytes, and data points to the RABIN_WINDOW+1 byte of
712
     * input.
713
     */
714
    inscnt = i;
715
716
    moff = 0;
717
    msize = 0;
718
    msource = NULL;
719
    while (data < top) {
720
        if (msize < 4096) {
721
            /* we don't have a 'worthy enough' match yet, so let's look for
722
             * one.
723
             */
724
            struct index_entry *entry;
725
            /* Shift the window by one byte. */
726
            val ^= U[data[-RABIN_WINDOW]];
727
            val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
728
            i = val & index->hash_mask;
729
            /* TODO: When using multiple indexes like this, the hash tables
730
             *       mapping val => index_entry become less efficient.
731
             *       You end up getting a lot more collisions in the hash,
732
             *       which doesn't actually lead to a entry->val match.
733
             */
734
            for (entry = index->hash[i]; entry < index->hash[i+1];
735
                 entry++) {
736
                const unsigned char *ref;
737
                const unsigned char *src;
738
                unsigned int ref_size;
739
                if (entry->val != val)
740
                    continue;
741
                ref = entry->ptr;
742
                src = data;
743
                ref_data = entry->src->buf;
744
                ref_top = ref_data + entry->src->size;
745
                ref_size = ref_top - ref;
746
                /* ref_size is the longest possible match that we could make
747
                 * here. If ref_size <= msize, then we know that we cannot
748
                 * match more bytes with this location that we have already
749
                 * matched.
750
                 */
751
                if (ref_size > top - src)
752
                    ref_size = top - src;
753
                if (ref_size <= msize)
754
                    break;
755
                /* See how many bytes actually match at this location. */
756
                while (ref_size-- && *src++ == *ref)
757
                    ref++;
758
                if (msize < ref - entry->ptr) {
759
                    /* this is our best match so far */
760
                    msize = ref - entry->ptr;
761
                    msource = entry->src;
762
                    moff = entry->ptr - ref_data;
763
                    if (msize >= 4096) /* good enough */
764
                        break;
765
                }
766
            }
767
        }
768
769
        if (msize < 4) {
770
            /* The best match right now is less than 4 bytes long. So just add
771
             * the current byte to the insert instruction. Increment the insert
772
             * counter, and copy the byte of data into the output buffer.
773
             */
774
            if (!inscnt)
775
                outpos++;
776
            out[outpos++] = *data++;
777
            inscnt++;
778
            if (inscnt == 0x7f) {
779
                /* We have a max length insert instruction, finalize it in the
780
                 * output.
781
                 */
782
                out[outpos - inscnt - 1] = inscnt;
783
                inscnt = 0;
784
            }
785
            msize = 0;
786
        } else {
787
            unsigned int left;
788
            unsigned char *op;
789
790
            if (inscnt) {
791
                ref_data = msource->buf;
792
                while (moff && ref_data[moff-1] == data[-1]) {
793
                    /* we can match one byte back */
794
                    msize++;
795
                    moff--;
796
                    data--;
797
                    outpos--;
798
                    if (--inscnt)
799
                        continue;
800
                    outpos--;  /* remove count slot */
801
                    inscnt--;  /* make it -1 */
802
                    break;
803
                }
804
                out[outpos - inscnt - 1] = inscnt;
805
                inscnt = 0;
806
            }
807
808
            /* A copy op is currently limited to 64KB (pack v2) */
809
            left = (msize < 0x10000) ? 0 : (msize - 0x10000);
810
            msize -= left;
811
812
            op = out + outpos++;
813
            i = 0x80;
814
815
            /* moff is the offset in the local structure, for encoding, we need
816
             * to push it into the global offset
817
             */
818
            assert(moff < msource->size);
819
            moff += msource->agg_offset;
820
            assert(moff + msize <= source_size);
821
            if (moff & 0x000000ff)
822
                out[outpos++] = moff >> 0,  i |= 0x01;
823
            if (moff & 0x0000ff00)
824
                out[outpos++] = moff >> 8,  i |= 0x02;
825
            if (moff & 0x00ff0000)
826
                out[outpos++] = moff >> 16, i |= 0x04;
827
            if (moff & 0xff000000)
828
                out[outpos++] = moff >> 24, i |= 0x08;
829
            /* Put it back into local coordinates, in case we have multiple
830
             * copies in a row.
831
             */
832
            moff -= msource->agg_offset;
833
834
            if (msize & 0x00ff)
835
                out[outpos++] = msize >> 0, i |= 0x10;
836
            if (msize & 0xff00)
837
                out[outpos++] = msize >> 8, i |= 0x20;
838
839
            *op = i;
840
841
            data += msize;
842
            moff += msize;
843
            msize = left;
844
845
            if (msize < 4096) {
846
                int j;
847
                val = 0;
848
                for (j = -RABIN_WINDOW; j < 0; j++)
849
                    val = ((val << 8) | data[j])
850
                          ^ T[val >> RABIN_SHIFT];
851
            }
852
        }
853
854
        if (outpos >= outsize - MAX_OP_SIZE) {
855
            void *tmp = out;
856
            outsize = outsize * 3 / 2;
857
            if (max_size && outsize >= max_size)
858
                outsize = max_size + MAX_OP_SIZE + 1;
859
            if (max_size && outpos > max_size)
860
                break;
861
            out = realloc(out, outsize);
862
            if (!out) {
863
                free(tmp);
864
                return NULL;
865
            }
866
        }
867
    }
868
869
    if (inscnt)
870
        out[outpos - inscnt - 1] = inscnt;
871
872
    if (max_size && outpos > max_size) {
873
        free(out);
874
        return NULL;
875
    }
876
877
    *delta_size = outpos;
878
    return out;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
879
}
0.23.24 by John Arbash Meinel
Change the code so that we can pass in multiple sources to match against.
880
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
881
/* vim: et ts=4 sw=4 sts=4
0.23.36 by John Arbash Meinel
Track down a memory leak in the refactored diff-delta.c code.
882
 */