/*
 * diff-delta.c: generate a delta between two buffers
 *
 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
 * http://www.xmailserver.org/xdiff-lib.html
 *
 * Rewritten for GIT by Nicolas Pitre <nico@fluxnic.net>, (C) 2005-2007
 * Adapted for Bazaar by John Arbash Meinel <john@arbash-meinel.com> (C) 2009
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * NB: The version in GIT is 'version 2 of the Licence only', however Nicolas
 * has granted permission for use under 'version 2 or later' in private email
 * to Robert Collins and Karl Fogel on the 6th April 2009.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
/* maximum hash entry list for the same hash bucket */

#define RABIN_SHIFT 23
#define RABIN_WINDOW 16

/* The hash map is sized to put 4 entries per bucket; this gives us roughly
 * even room for more data. Tweaking this number above 4 doesn't seem to
 * help much, anyway.
 */
static const unsigned int T[256] = {
    0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
    0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
    0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
    0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
    0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
    0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
    0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
    0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
    0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
    0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
    0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
    0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
    0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
    0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
    0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
    0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
    0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
    0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
    0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
    0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
    0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
    0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
    0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
    0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
    0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
    0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
    0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
    0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
    0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
    0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
    0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
    0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
    0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
    0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
    0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
    0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
    0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
    0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
    0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
    0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
    0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
    0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
    0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
};
static const unsigned int U[256] = {
    0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
    0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
    0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
    0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
    0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
    0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
    0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
    0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
    0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
    0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
    0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
    0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
    0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
    0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
    0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
    0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
    0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
    0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
    0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
    0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
    0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
    0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
    0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
    0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
    0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
    0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
    0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
    0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
    0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
    0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
    0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
    0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
    0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
    0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
    0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
    0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
    0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
    0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
    0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
    0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
    0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
    0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
    0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
};
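
/*
 * Added note (not in the original file): T[] advances the rolling Rabin
 * hash by one byte and U[] cancels the byte that falls out of the
 * RABIN_WINDOW-byte window, as used by create_delta() below. A minimal
 * sketch of rolling the window one byte to the right:
 */
#if 0
static unsigned int rabin_roll(unsigned int val, unsigned char out_byte,
                               unsigned char in_byte)
{
    val ^= U[out_byte];               /* cancel the byte leaving the window */
    return ((val << 8) | in_byte) ^ T[val >> RABIN_SHIFT]; /* mix in new one */
}
#endif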
struct index_entry {
    const unsigned char *ptr;
    const struct source_info *src;
    unsigned int val;
};
struct index_entry_linked_list {
    struct index_entry *p_entry;
    struct index_entry_linked_list *next;
};

struct unpacked_index_entry {
    struct index_entry entry;
    struct unpacked_index_entry *next;
};
struct delta_index {
    unsigned long memsize; /* Total bytes pointed to by this index */
    const struct source_info *last_src; /* Information about the referenced source */
    unsigned int hash_mask; /* val & hash_mask gives the hash index for a
                             * given entry */
    unsigned int num_entries; /* The total number of entries in this index */
    struct index_entry *last_entry; /* Pointer to the last valid entry */
    struct index_entry *hash[];
};
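
/*
 * Layout note (added for clarity, not in the original file): the index is
 * a single contiguous allocation:
 *
 *     [struct delta_index][hash[0..hsize]][packed index_entry array]
 *
 * hash[] holds hsize+1 pointers into the packed array; bucket i occupies
 * [hash[i], hash[i+1]), trailing entries with ptr == NULL are free slots,
 * and hash[hsize] is a sentinel marking the end of the last bucket.
 */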
limit_hash_buckets(struct unpacked_index_entry **hash,
                   unsigned int *hash_count, unsigned int hsize,
                   unsigned int entries)
{
    struct unpacked_index_entry *entry;
    unsigned int i;
    int acc;
    /*
     * Determine a limit on the number of entries in the same hash
     * bucket. This guards us against pathological data sets causing
     * really bad hash distribution with most entries in the same hash
     * bucket that would bring us to O(m*n) computing costs (m and n
     * corresponding to reference and target buffer sizes).
     *
     * Make sure none of the hash buckets has more entries than
     * we're willing to test. Otherwise we cull the entry list
     * uniformly to still preserve a good distribution across
     * the reference buffer.
     */
    for (i = 0; i < hsize; i++) {
        if (hash_count[i] <= HASH_LIMIT)
            continue;

        /* We leave exactly HASH_LIMIT entries in the bucket */
        entries -= hash_count[i] - HASH_LIMIT;

        entry = hash[i];
        acc = 0;
        /*
         * Assume that this loop runs exactly HASH_LIMIT times and is
         * entered and left with acc==0. So the first statement in the
         * loop contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT to
         * the accumulator, and the inner loop consequently is run
         * (hash_count[i]-HASH_LIMIT) times, removing one element from
         * the list each time. Since acc balances out to 0 at the
         * final run, the inner loop body can't be left with
         * entry==NULL. So we indeed encounter entry==NULL in the
         * outer loop only.
         */
        do {
            acc += hash_count[i] - HASH_LIMIT;
            if (acc > 0) {
                struct unpacked_index_entry *keep = entry;
                do {
                    entry = entry->next;
                    acc -= HASH_LIMIT;
                } while (acc > 0);
                keep->next = entry->next;
            }
            entry = entry->next;
        } while (entry);
    }
    return entries;
}
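
/*
 * Worked example (added for clarity; the HASH_LIMIT value is elided above,
 * GIT uses 64): culling a bucket of hash_count[i] == 200 entries down to
 * 64 adds 136 to acc on each outer pass and drops one entry each time acc
 * goes positive, so roughly every third entry survives and the survivors
 * stay evenly spread across the reference buffer.
 */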
static struct delta_index *
pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
                 unsigned int num_entries, struct delta_index *old_index)
{
    unsigned int i, j, hmask, memsize, fit_in_old, copied_count;
    struct unpacked_index_entry *entry;
    struct delta_index *index;
    struct index_entry *packed_entry, **packed_hash, *old_entry, *copy_from;
    struct index_entry null_entry = {0};
    void *mem;

    hmask = hsize - 1;
    // fprintf(stderr, "Packing %d entries into %d for total of %d entries"
    //         num_entries - old_index->num_entries,
    //         old_index->num_entries, num_entries,
    //         old_index->hash_mask, hmask);
    // fprintf(stderr, "Packing %d entries into a new index\n",
    /* First, see if we can squeeze the new items into the existing
     * structure.
     */
    fit_in_old = 0;
    copied_count = 0;
    if (old_index && old_index->hash_mask == hmask) {
        fit_in_old = 1;
        for (i = 0; i < hsize; ++i) {
            packed_entry = NULL;
            for (entry = hash[i]; entry; entry = entry->next) {
                if (packed_entry == NULL) {
                    /* Find the last open spot */
                    packed_entry = old_index->hash[i + 1];
                    --packed_entry;
                    while (packed_entry >= old_index->hash[i]
                           && packed_entry->ptr == NULL) {
                        --packed_entry;
                    }
                    ++packed_entry;
                }
                if (packed_entry >= old_index->hash[i+1]
                    || packed_entry->ptr != NULL) {
                    /* There are no free spots here :( */
                    fit_in_old = 0;
                    break;
                }
                /* We found an empty spot to put this entry.
                 * Copy it over, and remove it from the linked list, just in
                 * case we end up running out of room later.
                 */
                *packed_entry++ = entry->entry;
                assert(entry == hash[i]);
                hash[i] = entry->next;
                copied_count += 1;
                old_index->num_entries++;
            }
            if (!fit_in_old)
                break;
        }
    }
    if (fit_in_old) {
        // fprintf(stderr, "Fit all %d entries into old index\n",
        /*
         * No need to allocate a new buffer, but return old_index ptr so
         * callers can distinguish this from an OOM failure.
         */
        return old_index;
    }
    // fprintf(stderr, "Fit only %d entries into old index,"
    //         " reallocating\n", copied_count);
    /*
     * Now create the packed index in array form
     * rather than linked lists.
     * Leave a gap of EXTRA_NULLS entries between the groups
     * for inserting more entries later.
     */
    memsize = sizeof(*index)
        + sizeof(*packed_hash) * (hsize+1)
        + sizeof(*packed_entry) * (num_entries + hsize * EXTRA_NULLS);
    mem = malloc(memsize);
    if (!mem)
        return NULL;
    index = mem;
    index->memsize = memsize;
    index->hash_mask = hmask;
    index->num_entries = num_entries;
    if (old_index) {
        if (hmask < old_index->hash_mask) {
            fprintf(stderr, "hash mask was shrunk %x => %x\n",
                    old_index->hash_mask, hmask);
        }
        assert(hmask >= old_index->hash_mask);
    }

    packed_hash = index->hash;
    mem = packed_hash + (hsize+1);
    packed_entry = mem;
    for (i = 0; i < hsize; i++) {
        /*
         * Coalesce all entries belonging to one linked list
         * into consecutive array entries.
         */
        packed_hash[i] = packed_entry;
        /* Old comes earlier as a source, so it always comes first in a
         * given hash bucket.
         */
        /* Could we optimize this to use memcpy when hmask ==
         * old_index->hash_mask? Would it make any real difference?
         */
        j = i & old_index->hash_mask;
        copy_from = old_index->hash[j];
        for (old_entry = old_index->hash[j];
             old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;
             ++old_entry) {
            if ((old_entry->val & hmask) == i) {
                *packed_entry++ = *old_entry;
            }
        }
        for (entry = hash[i]; entry; entry = entry->next) {
            *packed_entry++ = entry->entry;
        }
        /* TODO: At this point packed_entry - packed_hash[i] is the number of
         * records that we have inserted into this hash bucket.
         * We should *really* consider doing some limiting along the
         * lines of limit_hash_buckets() to avoid pathological behavior.
         */
        /* Now add extra 'NULL' entries that we can use for future expansion. */
        for (j = 0; j < EXTRA_NULLS; ++j) {
            *packed_entry++ = null_entry;
        }
    }
    /* Sentinel value to indicate the length of the last hash bucket */
    packed_hash[hsize] = packed_entry;

    if (packed_entry - (struct index_entry *)mem
        != num_entries + hsize*EXTRA_NULLS) {
        fprintf(stderr, "We expected %d entries, but created %d\n",
                num_entries + hsize*EXTRA_NULLS,
                (int)(packed_entry - (struct index_entry*)mem));
    }
    assert(packed_entry - (struct index_entry *)mem
           == num_entries + hsize*EXTRA_NULLS);
    index->last_entry = (packed_entry - 1);
    return index;
}
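
/*
 * Note (added for clarity, not in the original file): after packing,
 * bucket i's entries live in [hash[i], hash[i+1]) with the EXTRA_NULLS
 * padding left as NULL-ptr slots at the tail. The hole-filling insertion
 * in create_delta_index_from_delta() below relies on this invariant.
 */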
create_delta_index(const struct source_info *src,
                   struct delta_index *old,
                   struct delta_index **fresh,
                   int max_bytes_to_index)
{
    unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
    unsigned int total_num_entries, stride, max_entries;
    const unsigned char *data, *buffer;
    struct delta_index *index;
    struct unpacked_index_entry *entry, **hash;
    void *mem;
    unsigned long memsize;

    if (!src->buf || !src->size)
        return DELTA_SOURCE_EMPTY;
    buffer = src->buf;
    /* Determine index hash size. Note that indexing skips the
       first byte so we subtract 1 to get the edge cases right.
     */
    stride = RABIN_WINDOW;
    num_entries = (src->size - 1) / RABIN_WINDOW;
    if (max_bytes_to_index > 0) {
        max_entries = (unsigned int) (max_bytes_to_index / RABIN_WINDOW);
        if (num_entries > max_entries) {
            /* Limit the max number of matching entries. This reduces the
             * 'best' possible match, but means we don't consume all of RAM.
             */
            num_entries = max_entries;
            stride = (src->size - 1) / num_entries;
        }
    }
    if (old != NULL)
        total_num_entries = num_entries + old->num_entries;
    else
        total_num_entries = num_entries;
    hsize = total_num_entries / 4;
    for (i = 4; (1u << i) < hsize && i < 31; i++);
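
    /*
     * Worked example (added for clarity): the loop above leaves i such that
     * 1 << i is the smallest power of two >= hsize, with a minimum of 16
     * buckets. E.g. total_num_entries == 1000 gives hsize == 250; the loop
     * stops at i == 8, so the table is sized to 1 << 8 == 256 buckets with
     * hmask == 0xff.
     */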
    hsize = 1u << i;
    hmask = hsize - 1;
    if (old && old->hash_mask > hmask) {
        hmask = old->hash_mask;
        hsize = hmask + 1;
    }

    /* allocate lookup index */
    memsize = sizeof(*hash) * hsize +
              sizeof(*entry) * total_num_entries;
    mem = malloc(memsize);
    if (!mem)
        return DELTA_OUT_OF_MEMORY;
    hash = mem;
    entry = (struct unpacked_index_entry *)(hash + hsize);

    memset(hash, 0, hsize * sizeof(*hash));

    /* allocate an array to count entries per hash bucket */
    hash_count = calloc(hsize, sizeof(*hash_count));
    if (!hash_count) {
        free(mem);
        return DELTA_OUT_OF_MEMORY;
    }
    /* then populate the index for the new data */
    prev_val = ~0;
    for (data = buffer + num_entries * stride - RABIN_WINDOW;
         data >= buffer;
         data -= stride) {
        unsigned int val = 0;
        for (i = 1; i <= RABIN_WINDOW; i++)
            val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
        if (val == prev_val) {
            /* keep the lowest of consecutive identical blocks */
            entry[-1].entry.ptr = data + RABIN_WINDOW;
        } else {
            prev_val = val;
            i = val & hmask;
            entry->entry.ptr = data + RABIN_WINDOW;
            entry->entry.val = val;
            entry->entry.src = src;
            entry->next = hash[i];
            hash[i] = entry++;
            hash_count[i]++;
        }
    }
    /* TODO: It would be nice to limit_hash_buckets at a better time. */
    total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
                                           total_num_entries);
    free(hash_count);
    index = pack_delta_index(hash, hsize, total_num_entries, old);
    free(hash);
    /* pack_delta_index only returns NULL on malloc failure */
    if (!index)
        return DELTA_OUT_OF_MEMORY;
    index->last_src = src;
    *fresh = index;
    return DELTA_OK;
}
/* Take some entries, and put them into a custom hash.
 * @param entries A list of entries, sorted by position in file
 * @param num_entries Length of entries
 * @param hsize The number of buckets in the hash table to build
 */
struct index_entry_linked_list **
_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,
                       unsigned int hsize)
{
    unsigned int hash_offset, hmask, memsize;
    struct index_entry *entry;
    struct index_entry_linked_list *out_entry, **hash;
    void *mem;

    hmask = hsize - 1;
    memsize = sizeof(*hash) * (hsize + 1) +
              sizeof(*out_entry) * num_entries;
    mem = malloc(memsize);
    if (!mem)
        return NULL;
    hash = mem;
    out_entry = (struct index_entry_linked_list *)(hash + hsize + 1);
    memset(hash, 0, sizeof(*hash) * (hsize + 1));

    /* We know that entries are in the order we want in the output, but they
     * aren't "grouped" by hash bucket yet. Walking the entries backwards
     * while pushing them onto the front of each bucket's list leaves every
     * bucket ordered by position in the file.
     */
    for (entry = entries + num_entries - 1; entry >= entries; --entry) {
        hash_offset = entry->val & hmask;
        out_entry->p_entry = entry;
        out_entry->next = hash[hash_offset];
        /* TODO: Remove entries that have identical vals, or at least filter
         * the map a little bit.
         * if (hash[i] != NULL) {
         * }
         */
        hash[hash_offset] = out_entry;
        ++out_entry;
    }
    return hash;
}
create_index_from_old_and_new_entries(const struct delta_index *old_index,
                                      struct index_entry *entries,
                                      unsigned int num_entries)
{
    unsigned int i, j, hsize, hmask, total_num_entries;
    struct delta_index *index;
    struct index_entry *entry, *packed_entry, **packed_hash;
    struct index_entry null_entry = {0};
    void *mem;
    unsigned long memsize;
    struct index_entry_linked_list *unpacked_entry, **mini_hash;

    /* Determine index hash size. Note that indexing skips the
       first byte to allow for optimizing the Rabin's polynomial
       initialization in create_delta(). */
    total_num_entries = num_entries + old_index->num_entries;
    hsize = total_num_entries / 4;
    for (i = 4; (1u << i) < hsize && i < 31; i++);
    hsize = 1u << i;
    if (hsize < old_index->hash_mask) {
        /* For some reason, there was a code path that would actually *shrink*
         * the hash size. This screws with some later code, and in general, I
         * think it better to make the hash bigger, rather than smaller. So
         * we'll just force the size here.
         * Possibly caused by create_delta_index running into a
         * limit_hash_buckets call that ended up crossing a power-of-2
         * boundary. The cause isn't 100% clear, though.
         */
        hsize = old_index->hash_mask + 1;
    }
    hmask = hsize - 1;
    // fprintf(stderr, "resizing index to insert %d entries into array"
    //         " with %d entries: %x => %x\n",
    //         num_entries, old_index->num_entries, old_index->hash_mask, hmask);
    memsize = sizeof(*index)
        + sizeof(*packed_hash) * (hsize+1)
        + sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);
    mem = malloc(memsize);
    if (!mem)
        return NULL;
    index = mem;
    index->memsize = memsize;
    index->hash_mask = hmask;
    index->num_entries = total_num_entries;
    index->last_src = old_index->last_src;

    packed_hash = index->hash;
    mem = packed_hash + (hsize+1);
    mini_hash = _put_entries_into_hash(entries, num_entries, hsize);
    if (mini_hash == NULL) {
        free(index);
        return NULL;
    }
    packed_entry = mem;
    for (i = 0; i < hsize; i++) {
        /*
         * Coalesce all entries belonging to one hash bucket
         * into consecutive array entries.
         * The entries in old_index all come before 'entries'.
         */
        packed_hash[i] = packed_entry;
        /* Copy any of the old entries across */
        /* Would we rather use memcpy? */
        if (hmask == old_index->hash_mask) {
            for (entry = old_index->hash[i];
                 entry < old_index->hash[i+1] && entry->ptr != NULL;
                 ++entry) {
                assert((entry->val & hmask) == i);
                *packed_entry++ = *entry;
            }
        } else {
            /* If we resized the index from this action, all of the old values
             * will be found in the previous location, but they will end up
             * spread across the new locations.
             */
            j = i & old_index->hash_mask;
            for (entry = old_index->hash[j];
                 entry < old_index->hash[j+1] && entry->ptr != NULL;
                 ++entry) {
                assert((entry->val & old_index->hash_mask) == j);
                if ((entry->val & hmask) == i) {
                    /* Any entries not picked up here will be picked up on
                     * another pass of this loop.
                     */
                    *packed_entry++ = *entry;
                }
            }
        }
        /* Now see if we need to insert any of the new entries.
         * Note that this loop ends up O(hsize*num_entries), so we expect
         * that num_entries is always small.
         * We also help a little bit by collapsing the entry range when the
         * endpoints are inserted. However, an alternative would be to build
         * a quick hash lookup for just the new entries.
         * Testing shows that this list can easily get up to about 100
         * entries; the tradeoff is a malloc, 1 pass over the entries,
         * copying them into a sorted buffer, and a free() when done.
         */
        for (unpacked_entry = mini_hash[i];
             unpacked_entry;
             unpacked_entry = unpacked_entry->next) {
            assert((unpacked_entry->p_entry->val & hmask) == i);
            *packed_entry++ = *(unpacked_entry->p_entry);
        }
        /* Now insert some extra nulls */
        for (j = 0; j < EXTRA_NULLS; ++j) {
            *packed_entry++ = null_entry;
        }
    }
    free(mini_hash);
    /* Sentinel value to indicate the length of the last hash bucket */
    packed_hash[hsize] = packed_entry;

    if ((packed_entry - (struct index_entry *)mem)
        != (total_num_entries + hsize*EXTRA_NULLS)) {
        fprintf(stderr, "We expected %d entries, but created %d\n",
                total_num_entries + hsize*EXTRA_NULLS,
                (int)(packed_entry - (struct index_entry*)mem));
    }
    assert((packed_entry - (struct index_entry *)mem)
           == (total_num_entries + hsize * EXTRA_NULLS));
    index->last_entry = (packed_entry - 1);
    return index;
}
get_text(char buff[128], const unsigned char *ptr)
{
    unsigned int i;
    const unsigned char *start;
    unsigned char cmd;
    start = (ptr - RABIN_WINDOW - 1);
    cmd = *start;
    if (cmd < 0x80) { // This is likely to be an insert instruction
        if (cmd < RABIN_WINDOW) {
            cmd = RABIN_WINDOW;
        } else {
            /* This was either a copy [should never be] or it
             * was a longer insert, so the insert start happened at 16 more
             * bytes back.
             */
            cmd = RABIN_WINDOW + 1;
        }
    }
    if (cmd > 60)
        cmd = 60; /* Be friendly to 80-char terminals */
    /* Copy the 1 byte command, and 4 bytes after the insert */
    memcpy(buff, start, cmd);
    buff[cmd] = '\0';
    for (i = 0; i < cmd; ++i) {
        if (buff[i] == '\n') {
            buff[i] = 'N';
        } else if (buff[i] == '\t') {
            buff[i] = 'T';
        }
    }
}
create_delta_index_from_delta(const struct source_info *src,
                              struct delta_index *old_index,
                              struct delta_index **fresh)
{
    unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;
    unsigned int hash_offset;
    const unsigned char *data, *buffer, *top;
    unsigned char cmd;
    struct delta_index *new_index;
    struct index_entry *entry, *entries;

    if (!old_index)
        return DELTA_INDEX_NEEDED;
    if (!src->buf || !src->size)
        return DELTA_SOURCE_EMPTY;
    buffer = src->buf;
    top = buffer + src->size;
    /* Determine index hash size. Note that indexing skips the
       first byte to allow for optimizing the Rabin's polynomial
       initialization in create_delta().
       This computes the maximum number of entries that could be held. The
       actual number will be recomputed during processing.
     */
    max_num_entries = (src->size - 1) / RABIN_WINDOW;
    if (!max_num_entries) {
        *fresh = old_index;
        return DELTA_OK;
    }

    /* allocate an array to hold whatever entries we find */
    entries = malloc(sizeof(*entry) * max_num_entries);
    if (!entries) /* malloc failure */
        return DELTA_OUT_OF_MEMORY;
    /* then populate the index for the new data */
    prev_val = ~0;
    data = buffer;
    /* get_delta_hdr_size doesn't mutate the content, just moves the
     * start-of-data pointer, so it is safe to do the cast.
     */
    get_delta_hdr_size((unsigned char**)&data, top);
    entry = entries; /* start at the first slot */
    num_entries = 0; /* calculate the real number of entries */
    while (data < top) {
        cmd = *data++;
        if (cmd & 0x80) {
            /* Copy instruction, skip it */
            if (cmd & 0x01) data++;
            if (cmd & 0x02) data++;
            if (cmd & 0x04) data++;
            if (cmd & 0x08) data++;
            if (cmd & 0x10) data++;
            if (cmd & 0x20) data++;
            if (cmd & 0x40) data++;
        } else if (cmd) {
            /* Insert instruction, we want to index these bytes */
            if (data + cmd > top) {
                /* Invalid insert, not enough bytes in the delta */
                break;
            }
            /* The create_delta code requires a match of at least 4
             * characters (including only the last char of the RABIN_WINDOW)
             * before it will consider it something worth copying rather
             * than inserting. So we don't want to index anything that we
             * know won't ever be a match.
             */
            for (; cmd > RABIN_WINDOW + 3; cmd -= RABIN_WINDOW,
                 data += RABIN_WINDOW) {
                unsigned int val = 0;
                for (i = 1; i <= RABIN_WINDOW; i++)
                    val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
                if (val != prev_val) {
                    /* Only keep the first of consecutive data */
                    prev_val = val;
                    num_entries++;
                    entry->ptr = data + RABIN_WINDOW;
                    entry->val = val;
                    entry->src = src;
                    entry++;
                    if (num_entries > max_num_entries) {
                        /* We ran out of entry room, something is really
                         * wrong.
                         */
                        break;
                    }
                }
            }
            /* Move the data pointer by whatever remainder is left */
            data += cmd;
        } else {
            /*
             * cmd == 0 is reserved for future encoding
             * extensions. In the meantime we must fail when
             * encountering them (might be data corruption).
             */
            break;
        }
    }
    if (data != top) {
        /* The source_info data passed was corrupted or otherwise invalid */
        free(entries);
        return DELTA_SOURCE_BAD;
    }
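
    /*
     * Format note (added for clarity, not in the original file): after the
     * size header, a delta is a stream of commands. A command byte with the
     * high bit set is a copy; bits 0x01..0x08 select which little-endian
     * offset bytes follow and bits 0x10..0x40 which length bytes follow
     * (absent bytes decode as zero). A command byte 0x01..0x7f is an insert
     * of that many literal bytes, and cmd == 0 is reserved, as noted above.
     */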
    if (num_entries == 0) {
        /** Nothing to index **/
        free(entries);
        *fresh = old_index;
        return DELTA_OK;
    }
    old_index->last_src = src;
    /* See if we can fill these values into the holes in the array */
    entry = entries;
    num_inserted = 0;
    for (; num_entries > 0; --num_entries, ++entry) {
        struct index_entry *next_bucket_entry, *cur_entry, *bucket_first_entry;
        hash_offset = (entry->val & old_index->hash_mask);
        /* The basic structure is a hash => packed_entries that fit in that
         * hash bucket. Things are structured such that the hash-pointers are
         * strictly ordered. So we start by pointing to the next pointer, and
         * walk back until we stop getting NULL targets, and then go back
         * forward. If there are no NULL targets, then we know because
         * entry->ptr will not be NULL.
         */
        // The start of the next bucket; this may point past the end of the
        // entry table if hash_offset is the last bucket.
        next_bucket_entry = old_index->hash[hash_offset + 1];
        // First entry in this bucket
        bucket_first_entry = old_index->hash[hash_offset];
        cur_entry = next_bucket_entry - 1;
        while (cur_entry >= bucket_first_entry && cur_entry->ptr == NULL) {
            cur_entry--;
        }
        cur_entry++;
        // cur_entry now either points at the first NULL, or it points to
        // next_bucket_entry if there were no blank spots.
        if (cur_entry >= next_bucket_entry || cur_entry->ptr != NULL) {
            /* There is no room for this entry, we have to resize */
            // get_text(buff, entry->ptr);
            // fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",
            //         hash_offset, entry->val, buff);
            // for (old_entry = old_index->hash[hash_offset];
            //      old_entry < old_index->hash[hash_offset+1];
            //      ++old_entry) {
            //     get_text(buff, old_entry->ptr);
            //     fprintf(stderr, " [%2d] %8x %8x: '%s'\n",
            //             (int)(old_entry - old_index->hash[hash_offset]),
            //             old_entry->val, old_entry->ptr, buff);
            // }
            break;
        }
        *cur_entry = *entry;
        /* For entries which we *do* manage to insert into old_index, we
         * don't want them double-copied into the final output.
         */
        old_index->num_entries++;
        num_inserted++;
    }
    if (num_entries > 0) {
        /* We couldn't fit the new entries into the old index, so allocate a
         * new one, and fill it with stuff.
         */
        // fprintf(stderr, "inserted %d before resize\n", num_inserted);
        new_index = create_index_from_old_and_new_entries(old_index,
                                                          entry, num_entries);
    } else {
        new_index = old_index;
        // fprintf(stderr, "inserted %d without resizing\n", num_inserted);
    }
    free(entries);
    /* create_index_from_old_and_new_entries returns NULL on malloc failure */
    if (new_index == NULL)
        return DELTA_OUT_OF_MEMORY;
    *fresh = new_index;
    return DELTA_OK;
}
void free_delta_index(struct delta_index *index)
{
    free(index);
}

unsigned long
sizeof_delta_index(struct delta_index *index)
{
    return index->memsize;
}
/*
 * The maximum size for any opcode sequence, including the initial header
 * plus Rabin window plus biggest copy.
 */
#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
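
/*
 * Breakdown (added for clarity; presumed from the encoder below and from
 * the GIT code this file derives from): two 5-byte size varints, one
 * insert command byte plus its RABIN_WINDOW literal bytes, and a copy op
 * of at most 7 bytes (command byte + up to 4 offset bytes + up to 2 size
 * bytes).
 */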
create_delta(const struct delta_index *index,
             const void *trg_buf, unsigned long trg_size,
             unsigned long *delta_size, unsigned long max_size,
             void **delta_data)
{
    unsigned int i, outpos, outsize, moff, val;
    unsigned long msize;
    const struct source_info *msource;
    int inscnt;
    const unsigned char *ref_data, *ref_top, *data, *top;
    unsigned char *out;
    unsigned long source_size;

    if (!trg_buf || !trg_size)
        return DELTA_BUFFER_EMPTY;
    if (index == NULL)
        return DELTA_INDEX_NEEDED;

    outpos = 0;
    outsize = 8192; /* initial guess; grown as needed below */
    if (max_size && outsize >= max_size)
        outsize = max_size + MAX_OP_SIZE + 1;
    out = malloc(outsize);
    if (!out)
        return DELTA_OUT_OF_MEMORY;

    source_size = index->last_src->size + index->last_src->agg_offset;
    /* store target buffer size */
    i = trg_size;
    while (i >= 0x80) {
        out[outpos++] = i | 0x80;
        i >>= 7;
    }
    out[outpos++] = i;

    data = (const unsigned char *) trg_buf;
    top = (const unsigned char *) trg_buf + trg_size;
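
    /*
     * Worked example (added for clarity): the header above is a base-128
     * varint, low bits first, the high bit of each byte meaning "more
     * follows". trg_size == 1000 (0x3e8) is stored as 0xe8 0x07 and decodes
     * as (0xe8 & 0x7f) | (0x07 << 7) == 104 + 896 == 1000.
     */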
    /* Start the matching by filling out with a simple 'insert' instruction
     * for the first RABIN_WINDOW bytes of the input.
     */
    outpos++; /* leave a byte for the insert command */
    val = 0;
    for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
        out[outpos++] = *data;
        val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
    }
    inscnt = i;
    /* we are now set up with an insert of 'i' bytes and val contains the
     * RABIN hash for those bytes, and data points to the RABIN_WINDOW+1
     * byte of input.
     */
    msize = 0;
    while (data < top) {
        if (msize < 4096) {
            /* we don't have a 'worthy enough' match yet, so let's look for
             * one.
             */
            struct index_entry *entry;
            /* Shift the window by one byte. */
            val ^= U[data[-RABIN_WINDOW]];
            val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
            i = val & index->hash_mask;
            /* TODO: When using multiple indexes like this, the hash tables
             * mapping val => index_entry become less efficient.
             * You end up getting a lot more collisions in the hash,
             * which doesn't actually lead to an entry->val match.
             */
            for (entry = index->hash[i];
                 entry < index->hash[i+1] && entry->src != NULL;
                 entry++) {
                const unsigned char *ref;
                const unsigned char *src;
                unsigned int ref_size;

                if (entry->val != val)
                    continue;
                ref = entry->ptr;
                src = data;
                ref_data = entry->src->buf;
                ref_top = ref_data + entry->src->size;
                ref_size = ref_top - ref;
                /* ref_size is the longest possible match that we could make
                 * here. If ref_size <= msize, then we know that we cannot
                 * match more bytes with this location than we have already
                 * matched.
                 */
                if (ref_size > (top - src))
                    ref_size = top - src;
                if (ref_size <= msize)
                    break;
                /* See how many bytes actually match at this location. */
                while (ref_size-- && *src++ == *ref)
                    ref++;
                if (msize < (ref - entry->ptr)) {
                    /* this is our best match so far */
                    msize = ref - entry->ptr;
                    msource = entry->src;
                    moff = entry->ptr - ref_data;
                    if (msize >= 4096) /* good enough */
                        break;
                }
            }
        }
        if (msize < 4) {
            /* The best match right now is less than 4 bytes long. So just
             * add the current byte to the insert instruction. Increment the
             * insert counter, and copy the byte of data into the output
             * buffer.
             */
            if (!inscnt)
                outpos++;
            out[outpos++] = *data++;
            inscnt++;
            if (inscnt == 0x7f) {
                /* We have a max length insert instruction; finalize it in
                 * the output.
                 */
                out[outpos - inscnt - 1] = inscnt;
                inscnt = 0;
            }
            msize = 0;
        } else {
            unsigned int left;
            unsigned char *op;

            if (inscnt) {
                ref_data = msource->buf;
                while (moff && ref_data[moff-1] == data[-1]) {
                    /* we can match one byte back */
                    msize++;
                    moff--;
                    data--;
                    outpos--;
                    if (--inscnt)
                        continue;
                    outpos--; /* remove count slot */
                    inscnt--; /* make it -1 */
                    break;
                }
                out[outpos - inscnt - 1] = inscnt;
                inscnt = 0;
            }
            /* A copy op is currently limited to 64KB (pack v2) */
            left = (msize < 0x10000) ? 0 : (msize - 0x10000);
            msize -= left;

            op = out + outpos++;
            i = 0x80;

            /* moff is the offset in the local structure; for encoding, we
             * need to push it into the global offset.
             */
            assert(moff < msource->size);
            moff += msource->agg_offset;
            assert(moff + msize <= source_size);
            if (moff & 0x000000ff)
                out[outpos++] = moff >> 0, i |= 0x01;
            if (moff & 0x0000ff00)
                out[outpos++] = moff >> 8, i |= 0x02;
            if (moff & 0x00ff0000)
                out[outpos++] = moff >> 16, i |= 0x04;
            if (moff & 0xff000000)
                out[outpos++] = moff >> 24, i |= 0x08;
            /* Put it back into local coordinates, in case we have multiple
             * copies in a row.
             */
            moff -= msource->agg_offset;

            if (msize & 0x00ff)
                out[outpos++] = msize >> 0, i |= 0x10;
            if (msize & 0xff00)
                out[outpos++] = msize >> 8, i |= 0x20;

            *op = i;

            data += msize;
            moff += msize;
            msize = left;

            if (msize < 4096) {
                int j;
                val = 0;
                for (j = -RABIN_WINDOW; j < 0; j++)
                    val = ((val << 8) | data[j])
                          ^ T[val >> RABIN_SHIFT];
            }
        }
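
        /*
         * Worked example (added for clarity): copying msize == 0x300 bytes
         * from global offset moff == 0x1234 emits offset bytes 0x34 and
         * 0x12 (flags 0x01|0x02) and length byte 0x03 (flag 0x20; the zero
         * low byte is skipped), so *op becomes 0x80|0x01|0x02|0x20 == 0xa3
         * and the stream holds 0xa3 0x34 0x12 0x03. Absent bytes decode
         * as zero.
         */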
        if (outpos >= outsize - MAX_OP_SIZE) {
            void *tmp = out;
            outsize = outsize * 3 / 2;
            if (max_size && outsize >= max_size)
                outsize = max_size + MAX_OP_SIZE + 1;
            if (max_size && outpos > max_size)
                break;
            out = realloc(out, outsize);
            if (!out) {
                free(tmp);
                return DELTA_OUT_OF_MEMORY;
            }
        }
    }
    if (inscnt)
        out[outpos - inscnt - 1] = inscnt;

    if (max_size && outpos > max_size) {
        free(out);
        return DELTA_SIZE_TOO_BIG;
    }

    *delta_size = outpos;
    *delta_data = out;
    return DELTA_OK;
}
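
/*
 * Illustrative caller sketch (not part of the original file). It assumes
 * a DELTA_OK success code alongside the error codes used above, the final
 * (void **) out-parameter of create_delta() as reconstructed in its
 * signature, and the source_info fields (buf, size, agg_offset) referenced
 * throughout this file.
 */
#if 0
static int example_make_delta(const void *base, unsigned long base_len,
                              const void *target, unsigned long target_len,
                              void **delta, unsigned long *delta_len)
{
    struct source_info src = {0};
    struct delta_index *index = NULL;

    src.buf = base;
    src.size = base_len;
    src.agg_offset = 0;

    if (create_delta_index(&src, NULL, &index, 0) != DELTA_OK)
        return -1;
    if (create_delta(index, target, target_len, delta_len, 0, delta)
        != DELTA_OK) {
        free_delta_index(index);
        return -1;
    }
    free_delta_index(index);
    return 0;
}
#endif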
get_entry_summary(const struct delta_index *index, int pos,
                  unsigned int *text_offset, unsigned int *hash_val)
{
    int hsize;
    const struct index_entry *entry;
    const struct index_entry *start_of_entries;
    unsigned int offset;

    if (pos < 0 || text_offset == NULL || hash_val == NULL
        || index == NULL) {
        return 0;
    }
    hsize = index->hash_mask + 1;
    start_of_entries = (struct index_entry *)
        (((struct index_entry **)index->hash) + (hsize + 1));
    entry = start_of_entries + pos;
    if (entry > index->last_entry) {
        return 0;
    }
    if (entry->ptr == NULL) {
        *text_offset = 0;
        *hash_val = 0;
    } else {
        offset = entry->src->agg_offset;
        offset += (entry->ptr - ((unsigned char *)entry->src->buf));
        *text_offset = offset;
        *hash_val = entry->val;
    }
    return 1;
}
get_hash_offset(const struct delta_index *index, int pos,
                unsigned int *entry_offset)
{
    int hsize;
    const struct index_entry *entry;
    const struct index_entry *start_of_entries;

    if (pos < 0 || index == NULL || entry_offset == NULL) {
        return 0;
    }
    hsize = index->hash_mask + 1;
    start_of_entries = (struct index_entry *)
        (((struct index_entry **)index->hash) + (hsize + 1));
    if (pos >= hsize) {
        return 0;
    }
    entry = index->hash[pos];
    if (entry == NULL) {
        *entry_offset = -1;
    } else {
        *entry_offset = (entry - start_of_entries);
    }
    return 1;
}
rabin_hash(const unsigned char *data)
{
    int i;
    unsigned int val = 0;
    for (i = 0; i < RABIN_WINDOW; i++)
        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
    return val;
}
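
/*
 * Usage note (added for clarity): this recomputes from scratch the value
 * that the rolling update in create_delta() maintains incrementally, e.g.
 * "unsigned int v = rabin_hash(data);" hashes data[0..RABIN_WINDOW-1],
 * which is handy for tests and sanity checks.
 */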
/* vim: et ts=4 sw=4 sts=4
 */