2
* diff-delta.c: generate a delta between two buffers
4
* This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5
* http://www.xmailserver.org/xdiff-lib.html
7
* Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
8
* Adapted for Bazaar by John Arbash Meinel <john@arbash-meinel.com> (C) 2009
10
* This program is free software; you can redistribute it and/or modify
11
* it under the terms of the GNU General Public License as published by
12
* the Free Software Foundation; either version 2 of the License, or
13
* (at your option) any later version.
15
* NB: The version in GIT is 'version 2 of the Licence only', however Nicolas
16
* has granted permission for use under 'version 2 or later' in private email
17
* to Robert Collins and Karl Fogel on the 6th April 2009.
27
/* maximum hash entry list for the same hash bucket */
30
#define RABIN_SHIFT 23
31
#define RABIN_WINDOW 16
33
/* The hash map is sized to put 4 entries per bucket, this gives us ~even room
34
* for more data. Tweaking this number above 4 doesn't seem to help much,
39
static const unsigned int T[256] = {
40
0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
41
0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
42
0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
43
0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
44
0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
45
0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
46
0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
47
0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
48
0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
49
0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
50
0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
51
0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
52
0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
53
0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
54
0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
55
0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
56
0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
57
0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
58
0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
59
0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
60
0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
61
0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
62
0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
63
0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
64
0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
65
0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
66
0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
67
0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
68
0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
69
0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
70
0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
71
0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
72
0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
73
0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
74
0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
75
0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
76
0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
77
0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
78
0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
79
0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
80
0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
81
0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
82
0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
85
static const unsigned int U[256] = {
86
0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
87
0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
88
0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
89
0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
90
0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
91
0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
92
0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
93
0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
94
0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
95
0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
96
0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
97
0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
98
0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
99
0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
100
0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
101
0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
102
0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
103
0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
104
0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
105
0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
106
0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
107
0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
108
0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
109
0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
110
0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
111
0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
112
0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
113
0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
114
0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
115
0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
116
0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
117
0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
118
0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
119
0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
120
0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
121
0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
122
0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
123
0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
124
0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
125
0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
126
0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
127
0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
128
0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
132
const unsigned char *ptr;
133
const struct source_info *src;
137
struct index_entry_linked_list {
138
struct index_entry *p_entry;
139
struct index_entry_linked_list *next;
142
struct unpacked_index_entry {
143
struct index_entry entry;
144
struct unpacked_index_entry *next;
148
unsigned long memsize; /* Total bytes pointed to by this index */
149
const struct source_info *last_src; /* Information about the referenced source */
150
unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
152
unsigned int num_entries; /* The total number of entries in this index */
153
struct index_entry *last_entry; /* Pointer to the last valid entry */
154
struct index_entry *hash[];
158
limit_hash_buckets(struct unpacked_index_entry **hash,
159
unsigned int *hash_count, unsigned int hsize,
160
unsigned int entries)
162
struct unpacked_index_entry *entry;
165
* Determine a limit on the number of entries in the same hash
166
* bucket. This guards us against pathological data sets causing
167
* really bad hash distribution with most entries in the same hash
168
* bucket that would bring us to O(m*n) computing costs (m and n
169
* corresponding to reference and target buffer sizes).
171
* Make sure none of the hash buckets has more entries than
172
* we're willing to test. Otherwise we cull the entry list
173
* uniformly to still preserve a good repartition across
174
* the reference buffer.
176
for (i = 0; i < hsize; i++) {
179
if (hash_count[i] <= HASH_LIMIT)
182
/* We leave exactly HASH_LIMIT entries in the bucket */
183
entries -= hash_count[i] - HASH_LIMIT;
189
* Assume that this loop is gone through exactly
190
* HASH_LIMIT times and is entered and left with
191
* acc==0. So the first statement in the loop
192
* contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
193
* to the accumulator, and the inner loop consequently
194
* is run (hash_count[i]-HASH_LIMIT) times, removing
195
* one element from the list each time. Since acc
196
* balances out to 0 at the final run, the inner loop
197
* body can't be left with entry==NULL. So we indeed
198
* encounter entry==NULL in the outer loop only.
201
acc += hash_count[i] - HASH_LIMIT;
203
struct unpacked_index_entry *keep = entry;
208
keep->next = entry->next;
216
static struct delta_index *
217
pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
218
unsigned int num_entries, struct delta_index *old_index)
220
unsigned int i, j, hmask, memsize, fit_in_old, copied_count;
221
struct unpacked_index_entry *entry;
222
struct delta_index *index;
223
struct index_entry *packed_entry, **packed_hash, *old_entry, *copy_from;
224
struct index_entry null_entry = {0};
230
// fprintf(stderr, "Packing %d entries into %d for total of %d entries"
232
// num_entries - old_index->num_entries,
233
// old_index->num_entries, num_entries,
234
// old_index->hash_mask, hmask);
236
// fprintf(stderr, "Packing %d entries into a new index\n",
239
/* First, see if we can squeeze the new items into the existing structure.
243
if (old_index && old_index->hash_mask == hmask) {
245
for (i = 0; i < hsize; ++i) {
247
for (entry = hash[i]; entry; entry = entry->next) {
248
if (packed_entry == NULL) {
249
/* Find the last open spot */
250
packed_entry = old_index->hash[i + 1];
252
while (packed_entry >= old_index->hash[i]
253
&& packed_entry->ptr == NULL) {
258
if (packed_entry >= old_index->hash[i+1]
259
|| packed_entry->ptr != NULL) {
260
/* There are no free spots here :( */
264
/* We found an empty spot to put this entry
265
* Copy it over, and remove it from the linked list, just in
266
* case we end up running out of room later.
268
*packed_entry++ = entry->entry;
269
assert(entry == hash[i]);
270
hash[i] = entry->next;
272
old_index->num_entries++;
281
// fprintf(stderr, "Fit all %d entries into old index\n",
283
/* No need to allocate a new buffer */
286
// fprintf(stderr, "Fit only %d entries into old index,"
287
// " reallocating\n", copied_count);
291
* Now create the packed index in array form
292
* rather than linked lists.
293
* Leave a 2-entry gap for inserting more entries between the groups
295
memsize = sizeof(*index)
296
+ sizeof(*packed_hash) * (hsize+1)
297
+ sizeof(*packed_entry) * (num_entries + hsize * EXTRA_NULLS);
298
mem = malloc(memsize);
304
index->memsize = memsize;
305
index->hash_mask = hmask;
306
index->num_entries = num_entries;
308
if (hmask < old_index->hash_mask) {
309
fprintf(stderr, "hash mask was shrunk %x => %x\n",
310
old_index->hash_mask, hmask);
312
assert(hmask >= old_index->hash_mask);
317
mem = packed_hash + (hsize+1);
320
for (i = 0; i < hsize; i++) {
322
* Coalesce all entries belonging to one linked list
323
* into consecutive array entries.
325
packed_hash[i] = packed_entry;
326
/* Old comes earlier as a source, so it always comes first in a given
330
/* Could we optimize this to use memcpy when hmask ==
331
* old_index->hash_mask? Would it make any real difference?
333
j = i & old_index->hash_mask;
334
copy_from = old_index->hash[j];
335
for (old_entry = old_index->hash[j];
336
old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;
338
if ((old_entry->val & hmask) == i) {
339
*packed_entry++ = *old_entry;
343
for (entry = hash[i]; entry; entry = entry->next) {
344
*packed_entry++ = entry->entry;
346
/* TODO: At this point packed_entry - packed_hash[i] is the number of
347
* records that we have inserted into this hash bucket.
348
* We should *really* consider doing some limiting along the
349
* lines of limit_hash_buckets() to avoid pathological behavior.
351
/* Now add extra 'NULL' entries that we can use for future expansion. */
352
for (j = 0; j < EXTRA_NULLS; ++j ) {
353
*packed_entry++ = null_entry;
357
/* Sentinel value to indicate the length of the last hash bucket */
358
packed_hash[hsize] = packed_entry;
360
if (packed_entry - (struct index_entry *)mem
361
!= num_entries + hsize*EXTRA_NULLS) {
362
fprintf(stderr, "We expected %d entries, but created %d\n",
363
num_entries + hsize*EXTRA_NULLS,
364
(int)(packed_entry - (struct index_entry*)mem));
366
assert(packed_entry - (struct index_entry *)mem
367
== num_entries + hsize*EXTRA_NULLS);
368
index->last_entry = (packed_entry - 1);
374
create_delta_index(const struct source_info *src,
375
struct delta_index *old)
377
unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
378
unsigned int total_num_entries;
379
const unsigned char *data, *buffer;
380
struct delta_index *index;
381
struct unpacked_index_entry *entry, **hash;
383
unsigned long memsize;
385
if (!src->buf || !src->size)
389
/* Determine index hash size. Note that indexing skips the
390
first byte to allow for optimizing the Rabin's polynomial
391
initialization in create_delta(). */
392
num_entries = (src->size - 1) / RABIN_WINDOW;
394
total_num_entries = num_entries + old->num_entries;
396
total_num_entries = num_entries;
397
hsize = total_num_entries / 4;
398
for (i = 4; (1u << i) < hsize && i < 31; i++);
401
if (old && old->hash_mask > hmask) {
402
hmask = old->hash_mask;
406
/* allocate lookup index */
407
memsize = sizeof(*hash) * hsize +
408
sizeof(*entry) * total_num_entries;
409
mem = malloc(memsize);
416
memset(hash, 0, hsize * sizeof(*hash));
418
/* allocate an array to count hash num_entries */
419
hash_count = calloc(hsize, sizeof(*hash_count));
425
/* then populate the index for the new data */
427
for (data = buffer + num_entries * RABIN_WINDOW - RABIN_WINDOW;
429
data -= RABIN_WINDOW) {
430
unsigned int val = 0;
431
for (i = 1; i <= RABIN_WINDOW; i++)
432
val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
433
if (val == prev_val) {
434
/* keep the lowest of consecutive identical blocks */
435
entry[-1].entry.ptr = data + RABIN_WINDOW;
441
entry->entry.ptr = data + RABIN_WINDOW;
442
entry->entry.val = val;
443
entry->entry.src = src;
444
entry->next = hash[i];
449
/* TODO: It would be nice to limit_hash_buckets at a better time. */
450
total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
456
index = pack_delta_index(hash, hsize, total_num_entries, old);
461
index->last_src = src;
465
/* Take some entries, and put them into a custom hash.
466
* @param entries A list of entries, sorted by position in file
467
* @param num_entries Length of entries
468
* @param out_hsize The maximum size of the hash, the final size will be
471
struct index_entry_linked_list **
472
_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,
475
unsigned int hash_offset, hmask, memsize;
476
struct index_entry *entry, *last_entry;
477
struct index_entry_linked_list *out_entry, **hash;
482
memsize = sizeof(*hash) * hsize +
483
sizeof(*out_entry) * num_entries;
484
mem = malloc(memsize);
491
memset(hash, 0, sizeof(*hash)*(hsize+1));
493
/* We know that entries are in the order we want in the output, but they
494
* aren't "grouped" by hash bucket yet.
496
last_entry = entries + num_entries;
497
for (entry = entries + num_entries - 1; entry >= entries; --entry) {
498
hash_offset = entry->val & hmask;
499
out_entry->p_entry = entry;
500
out_entry->next = hash[hash_offset];
501
/* TODO: Remove entries that have identical vals, or at least filter
502
* the map a little bit.
503
* if (hash[i] != NULL) {
506
hash[hash_offset] = out_entry;
514
create_index_from_old_and_new_entries(const struct delta_index *old_index,
515
struct index_entry *entries,
516
unsigned int num_entries)
518
unsigned int i, j, hsize, hmask, total_num_entries;
519
struct delta_index *index;
520
struct index_entry *entry, *packed_entry, **packed_hash;
521
struct index_entry *last_entry, null_entry = {0};
523
unsigned long memsize;
524
struct index_entry_linked_list *unpacked_entry, **mini_hash;
526
/* Determine index hash size. Note that indexing skips the
527
first byte to allow for optimizing the Rabin's polynomial
528
initialization in create_delta(). */
529
total_num_entries = num_entries + old_index->num_entries;
530
hsize = total_num_entries / 4;
531
for (i = 4; (1u << i) < hsize && i < 31; i++);
533
if (hsize < old_index->hash_mask) {
534
/* For some reason, there was a code path that would actually *shrink*
535
* the hash size. This screws with some later code, and in general, I
536
* think it better to make the hash bigger, rather than smaller. So
537
* we'll just force the size here.
538
* Possibly done by create_delta_index running into a
539
* limit_hash_buckets call, that ended up transitioning across a
540
* power-of-2. The cause isn't 100% clear, though.
542
hsize = old_index->hash_mask + 1;
545
// fprintf(stderr, "resizing index to insert %d entries into array"
546
// " with %d entries: %x => %x\n",
547
// num_entries, old_index->num_entries, old_index->hash_mask, hmask);
549
memsize = sizeof(*index)
550
+ sizeof(*packed_hash) * (hsize+1)
551
+ sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);
552
mem = malloc(memsize);
557
index->memsize = memsize;
558
index->hash_mask = hmask;
559
index->num_entries = total_num_entries;
560
index->last_src = old_index->last_src;
564
mem = packed_hash + (hsize+1);
567
mini_hash = _put_entries_into_hash(entries, num_entries, hsize);
568
if (mini_hash == NULL) {
572
last_entry = entries + num_entries;
573
for (i = 0; i < hsize; i++) {
575
* Coalesce all entries belonging in one hash bucket
576
* into consecutive array entries.
577
* The entries in old_index all come before 'entries'.
579
packed_hash[i] = packed_entry;
580
/* Copy any of the old entries across */
581
/* Would we rather use memcpy? */
582
if (hmask == old_index->hash_mask) {
583
for (entry = old_index->hash[i];
584
entry < old_index->hash[i+1] && entry->ptr != NULL;
586
assert((entry->val & hmask) == i);
587
*packed_entry++ = *entry;
590
/* If we resized the index from this action, all of the old values
591
* will be found in the previous location, but they will end up
592
* spread across the new locations.
594
j = i & old_index->hash_mask;
595
for (entry = old_index->hash[j];
596
entry < old_index->hash[j+1] && entry->ptr != NULL;
598
assert((entry->val & old_index->hash_mask) == j);
599
if ((entry->val & hmask) == i) {
600
/* Any entries not picked up here will be picked up on the
603
*packed_entry++ = *entry;
607
/* Now see if we need to insert any of the new entries.
608
* Note that loop ends up O(hsize*num_entries), so we expect that
609
* num_entries is always small.
610
* We also help a little bit by collapsing the entry range when the
611
* endpoints are inserted. However, an alternative would be to build a
612
* quick hash lookup for just the new entries.
613
* Testing shows that this list can easily get up to about 100
614
* entries, the tradeoff is a malloc, 1 pass over the entries, copying
615
* them into a sorted buffer, and a free() when done,
617
for (unpacked_entry = mini_hash[i];
619
unpacked_entry = unpacked_entry->next) {
620
assert((unpacked_entry->p_entry->val & hmask) == i);
621
*packed_entry++ = *(unpacked_entry->p_entry);
623
/* Now insert some extra nulls */
624
for (j = 0; j < EXTRA_NULLS; ++j) {
625
*packed_entry++ = null_entry;
630
/* Sentinel value to indicate the length of the last hash bucket */
631
packed_hash[hsize] = packed_entry;
633
if ((packed_entry - (struct index_entry *)mem)
634
!= (total_num_entries + hsize*EXTRA_NULLS)) {
635
fprintf(stderr, "We expected %d entries, but created %d\n",
636
total_num_entries + hsize*EXTRA_NULLS,
637
(int)(packed_entry - (struct index_entry*)mem));
640
assert((packed_entry - (struct index_entry *)mem)
641
== (total_num_entries + hsize * EXTRA_NULLS));
642
index->last_entry = (packed_entry - 1);
648
get_text(char buff[128], const unsigned char *ptr)
651
const unsigned char *start;
653
start = (ptr-RABIN_WINDOW-1);
655
if (cmd < 0x80) {// This is likely to be an insert instruction
656
if (cmd < RABIN_WINDOW) {
660
/* This was either a copy [should never be] or it
661
* was a longer insert so the insert start happened at 16 more
664
cmd = RABIN_WINDOW + 1;
667
cmd = 60; /* Be friendly to 80char terms */
669
/* Copy the 1 byte command, and 4 bytes after the insert */
671
memcpy(buff, start, cmd);
673
for (i = 0; i < cmd; ++i) {
674
if (buff[i] == '\n') {
676
} else if (buff[i] == '\t') {
683
create_delta_index_from_delta(const struct source_info *src,
684
struct delta_index *old_index)
686
unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;
687
unsigned int hash_offset;
688
const unsigned char *data, *buffer, *top;
690
struct delta_index *new_index;
691
struct index_entry *entry, *entries, *old_entry;
693
if (!src->buf || !src->size)
696
top = buffer + src->size;
698
/* Determine index hash size. Note that indexing skips the
699
first byte to allow for optimizing the Rabin's polynomial
700
initialization in create_delta().
701
This computes the maximum number of entries that could be held. The
702
actual number will be recomputed during processing.
705
max_num_entries = (src->size - 1) / RABIN_WINDOW;
707
/* allocate an array to hold whatever entries we find */
708
entries = malloc(sizeof(*entry) * max_num_entries);
709
if (!entries) /* malloc failure */
712
/* then populate the index for the new data */
716
get_delta_hdr_size(&data, top);
717
entry = entries; /* start at the first slot */
718
num_entries = 0; /* calculate the real number of entries */
722
/* Copy instruction, skip it */
723
if (cmd & 0x01) data++;
724
if (cmd & 0x02) data++;
725
if (cmd & 0x04) data++;
726
if (cmd & 0x08) data++;
727
if (cmd & 0x10) data++;
728
if (cmd & 0x20) data++;
729
if (cmd & 0x40) data++;
731
/* Insert instruction, we want to index these bytes */
732
if (data + cmd > top) {
733
/* Invalid insert, not enough bytes in the delta */
736
/* The create_delta code requires a match at least 4 characters
737
* (including only the last char of the RABIN_WINDOW) before it
738
* will consider it something worth copying rather than inserting.
739
* So we don't want to index anything that we know won't ever be a
742
for (; cmd > RABIN_WINDOW + 3; cmd -= RABIN_WINDOW,
743
data += RABIN_WINDOW) {
744
unsigned int val = 0;
745
for (i = 1; i <= RABIN_WINDOW; i++)
746
val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
747
if (val != prev_val) {
748
/* Only keep the first of consecutive data */
751
entry->ptr = data + RABIN_WINDOW;
755
if (num_entries > max_num_entries) {
756
/* We ran out of entry room, something is really wrong
762
/* Move the data pointer by whatever remainder is left */
766
* cmd == 0 is reserved for future encoding
767
* extensions. In the mean time we must fail when
768
* encountering them (might be data corruption).
774
/* Something was wrong with this delta */
778
if (num_entries == 0) {
779
/** Nothing to index **/
783
assert(old_index != NULL);
784
old_index->last_src = src;
785
/* See if we can fill in these values into the holes in the array */
788
for (; num_entries > 0; --num_entries, ++entry) {
789
hash_offset = (entry->val & old_index->hash_mask);
790
/* The basic structure is a hash => packed_entries that fit in that
791
* hash bucket. Things are structured such that the hash-pointers are
792
* strictly ordered. So we start by pointing to the next pointer, and
793
* walk back until we stop getting NULL targets, and then go back
794
* forward. If there are no NULL targets, then we know because
795
* entry->ptr will not be NULL.
797
old_entry = old_index->hash[hash_offset + 1];
799
while (old_entry->ptr == NULL
800
&& old_entry >= old_index->hash[hash_offset]) {
804
if (old_entry->ptr != NULL
805
|| old_entry >= old_index->hash[hash_offset + 1]) {
806
/* There is no room for this entry, we have to resize */
808
// get_text(buff, entry->ptr);
809
// fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",
810
// hash_offset, entry->val, buff);
811
// for (old_entry = old_index->hash[hash_offset];
812
// old_entry < old_index->hash[hash_offset+1];
814
// get_text(buff, old_entry->ptr);
815
// fprintf(stderr, " [%2d] %8x %8x: '%s'\n",
816
// (int)(old_entry - old_index->hash[hash_offset]),
817
// old_entry->val, old_entry->ptr, buff);
823
/* For entries which we *do* manage to insert into old_index, we don't
824
* want them double copied into the final output.
826
old_index->num_entries++;
828
if (num_entries > 0) {
829
/* We couldn't fit the new entries into the old index, so allocate a
830
* new one, and fill it with stuff.
832
// fprintf(stderr, "inserted %d before resize\n", num_inserted);
833
new_index = create_index_from_old_and_new_entries(old_index,
837
// fprintf(stderr, "inserted %d without resizing\n", num_inserted);
843
void free_delta_index(struct delta_index *index)
848
unsigned long sizeof_delta_index(struct delta_index *index)
851
return index->memsize;
857
* The maximum size for any opcode sequence, including the initial header
858
* plus Rabin window plus biggest copy.
860
#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
863
create_delta(const struct delta_index *index,
864
const void *trg_buf, unsigned long trg_size,
865
unsigned long *delta_size, unsigned long max_size)
867
unsigned int i, outpos, outsize, moff, msize, val;
868
const struct source_info *msource;
870
const unsigned char *ref_data, *ref_top, *data, *top;
872
unsigned long source_size;
874
if (!trg_buf || !trg_size)
881
if (max_size && outsize >= max_size)
882
outsize = max_size + MAX_OP_SIZE + 1;
883
out = malloc(outsize);
887
source_size = index->last_src->size + index->last_src->agg_offset;
889
/* store target buffer size */
892
out[outpos++] = i | 0x80;
898
top = (const unsigned char *) trg_buf + trg_size;
900
/* Start the matching by filling out with a simple 'insert' instruction, of
901
* the first RABIN_WINDOW bytes of the input.
903
outpos++; /* leave a byte for the insert command */
905
for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
906
out[outpos++] = *data;
907
val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
909
/* we are now setup with an insert of 'i' bytes and val contains the RABIN
910
* hash for those bytes, and data points to the RABIN_WINDOW+1 byte of
920
/* we don't have a 'worthy enough' match yet, so let's look for
923
struct index_entry *entry;
924
/* Shift the window by one byte. */
925
val ^= U[data[-RABIN_WINDOW]];
926
val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
927
i = val & index->hash_mask;
928
/* TODO: When using multiple indexes like this, the hash tables
929
* mapping val => index_entry become less efficient.
930
* You end up getting a lot more collisions in the hash,
931
* which doesn't actually lead to a entry->val match.
933
for (entry = index->hash[i];
934
entry < index->hash[i+1] && entry->src != NULL;
936
const unsigned char *ref;
937
const unsigned char *src;
938
unsigned int ref_size;
939
if (entry->val != val)
943
ref_data = entry->src->buf;
944
ref_top = ref_data + entry->src->size;
945
ref_size = ref_top - ref;
946
/* ref_size is the longest possible match that we could make
947
* here. If ref_size <= msize, then we know that we cannot
948
* match more bytes with this location that we have already
951
if (ref_size > top - src)
952
ref_size = top - src;
953
if (ref_size <= msize)
955
/* See how many bytes actually match at this location. */
956
while (ref_size-- && *src++ == *ref)
958
if (msize < ref - entry->ptr) {
959
/* this is our best match so far */
960
msize = ref - entry->ptr;
961
msource = entry->src;
962
moff = entry->ptr - ref_data;
963
if (msize >= 4096) /* good enough */
970
/* The best match right now is less than 4 bytes long. So just add
971
* the current byte to the insert instruction. Increment the insert
972
* counter, and copy the byte of data into the output buffer.
976
out[outpos++] = *data++;
978
if (inscnt == 0x7f) {
979
/* We have a max length insert instruction, finalize it in the
982
out[outpos - inscnt - 1] = inscnt;
991
ref_data = msource->buf;
992
while (moff && ref_data[moff-1] == data[-1]) {
993
/* we can match one byte back */
1000
outpos--; /* remove count slot */
1001
inscnt--; /* make it -1 */
1004
out[outpos - inscnt - 1] = inscnt;
1008
/* A copy op is currently limited to 64KB (pack v2) */
1009
left = (msize < 0x10000) ? 0 : (msize - 0x10000);
1012
op = out + outpos++;
1015
/* moff is the offset in the local structure, for encoding, we need
1016
* to push it into the global offset
1018
assert(moff < msource->size);
1019
moff += msource->agg_offset;
1020
assert(moff + msize <= source_size);
1021
if (moff & 0x000000ff)
1022
out[outpos++] = moff >> 0, i |= 0x01;
1023
if (moff & 0x0000ff00)
1024
out[outpos++] = moff >> 8, i |= 0x02;
1025
if (moff & 0x00ff0000)
1026
out[outpos++] = moff >> 16, i |= 0x04;
1027
if (moff & 0xff000000)
1028
out[outpos++] = moff >> 24, i |= 0x08;
1029
/* Put it back into local coordinates, in case we have multiple
1032
moff -= msource->agg_offset;
1035
out[outpos++] = msize >> 0, i |= 0x10;
1037
out[outpos++] = msize >> 8, i |= 0x20;
1048
for (j = -RABIN_WINDOW; j < 0; j++)
1049
val = ((val << 8) | data[j])
1050
^ T[val >> RABIN_SHIFT];
1054
if (outpos >= outsize - MAX_OP_SIZE) {
1056
outsize = outsize * 3 / 2;
1057
if (max_size && outsize >= max_size)
1058
outsize = max_size + MAX_OP_SIZE + 1;
1059
if (max_size && outpos > max_size)
1061
out = realloc(out, outsize);
1070
out[outpos - inscnt - 1] = inscnt;
1072
if (max_size && outpos > max_size) {
1077
*delta_size = outpos;
1081
/* vim: et ts=4 sw=4 sts=4