2
* diff-delta.c: generate a delta between two buffers
4
* This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5
* http://www.xmailserver.org/xdiff-lib.html
7
* Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
8
* Adapted for Bazaar by John Arbash Meinel <john@arbash-meinel.com> (C) 2009
10
* This code is free software; you can redistribute it and/or modify
11
* it under the terms of the GNU General Public License version 2 as
12
* published by the Free Software Foundation.
22
/* maximum hash entry list for the same hash bucket */
25
#define RABIN_SHIFT 23
26
#define RABIN_WINDOW 16
28
/* The hash map is sized to put 4 entries per bucket, this gives us ~even room
29
* for more data. Tweaking this number above 4 doesn't seem to help much,
34
static const unsigned int T[256] = {
35
0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
36
0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
37
0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
38
0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
39
0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
40
0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
41
0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
42
0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
43
0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
44
0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
45
0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
46
0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
47
0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
48
0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
49
0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
50
0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
51
0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
52
0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
53
0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
54
0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
55
0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
56
0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
57
0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
58
0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
59
0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
60
0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
61
0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
62
0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
63
0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
64
0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
65
0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
66
0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
67
0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
68
0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
69
0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
70
0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
71
0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
72
0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
73
0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
74
0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
75
0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
76
0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
77
0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
80
static const unsigned int U[256] = {
81
0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
82
0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
83
0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
84
0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
85
0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
86
0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
87
0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
88
0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
89
0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
90
0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
91
0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
92
0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
93
0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
94
0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
95
0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
96
0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
97
0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
98
0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
99
0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
100
0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
101
0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
102
0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
103
0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
104
0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
105
0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
106
0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
107
0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
108
0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
109
0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
110
0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
111
0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
112
0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
113
0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
114
0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
115
0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
116
0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
117
0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
118
0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
119
0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
120
0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
121
0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
122
0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
123
0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
127
const unsigned char *ptr;
128
const struct source_info *src;
132
struct index_entry_linked_list {
133
struct index_entry *p_entry;
134
struct index_entry_linked_list *next;
137
struct unpacked_index_entry {
138
struct index_entry entry;
139
struct unpacked_index_entry *next;
143
unsigned long memsize; /* Total bytes pointed to by this index */
144
const struct source_info *last_src; /* Information about the referenced source */
145
unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
147
unsigned int num_entries; /* The total number of entries in this index */
148
struct index_entry *last_entry; /* Pointer to the last valid entry */
149
struct index_entry *hash[];
153
limit_hash_buckets(struct unpacked_index_entry **hash,
154
unsigned int *hash_count, unsigned int hsize,
155
unsigned int entries)
157
struct unpacked_index_entry *entry;
160
* Determine a limit on the number of entries in the same hash
161
* bucket. This guards us against pathological data sets causing
162
* really bad hash distribution with most entries in the same hash
163
* bucket that would bring us to O(m*n) computing costs (m and n
164
* corresponding to reference and target buffer sizes).
166
* Make sure none of the hash buckets has more entries than
167
* we're willing to test. Otherwise we cull the entry list
168
* uniformly to still preserve a good repartition across
169
* the reference buffer.
171
for (i = 0; i < hsize; i++) {
174
if (hash_count[i] <= HASH_LIMIT)
177
/* We leave exactly HASH_LIMIT entries in the bucket */
178
entries -= hash_count[i] - HASH_LIMIT;
184
* Assume that this loop is gone through exactly
185
* HASH_LIMIT times and is entered and left with
186
* acc==0. So the first statement in the loop
187
* contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
188
* to the accumulator, and the inner loop consequently
189
* is run (hash_count[i]-HASH_LIMIT) times, removing
190
* one element from the list each time. Since acc
191
* balances out to 0 at the final run, the inner loop
192
* body can't be left with entry==NULL. So we indeed
193
* encounter entry==NULL in the outer loop only.
196
acc += hash_count[i] - HASH_LIMIT;
198
struct unpacked_index_entry *keep = entry;
203
keep->next = entry->next;
211
static struct delta_index *
212
pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
213
unsigned int num_entries, struct delta_index *old_index)
215
unsigned int i, j, hmask, memsize, fit_in_old, copied_count;
216
struct unpacked_index_entry *entry;
217
struct delta_index *index;
218
struct index_entry *packed_entry, **packed_hash, *old_entry, *copy_from;
219
struct index_entry null_entry = {0};
225
// fprintf(stderr, "Packing %d entries into %d for total of %d entries"
227
// num_entries - old_index->num_entries,
228
// old_index->num_entries, num_entries,
229
// old_index->hash_mask, hmask);
231
// fprintf(stderr, "Packing %d entries into a new index\n",
234
/* First, see if we can squeeze the new items into the existing structure.
238
if (old_index && old_index->hash_mask == hmask) {
240
for (i = 0; i < hsize; ++i) {
242
for (entry = hash[i]; entry; entry = entry->next) {
243
if (packed_entry == NULL) {
244
/* Find the last open spot */
245
packed_entry = old_index->hash[i + 1];
247
while (packed_entry >= old_index->hash[i]
248
&& packed_entry->ptr == NULL) {
253
if (packed_entry >= old_index->hash[i+1]
254
|| packed_entry->ptr != NULL) {
255
/* There are no free spots here :( */
259
/* We found an empty spot to put this entry
260
* Copy it over, and remove it from the linked list, just in
261
* case we end up running out of room later.
263
*packed_entry++ = entry->entry;
264
assert(entry == hash[i]);
265
hash[i] = entry->next;
267
old_index->num_entries++;
276
// fprintf(stderr, "Fit all %d entries into old index\n",
278
/* No need to allocate a new buffer */
281
// fprintf(stderr, "Fit only %d entries into old index,"
282
// " reallocating\n", copied_count);
286
* Now create the packed index in array form
287
* rather than linked lists.
288
* Leave a 2-entry gap for inserting more entries between the groups
290
memsize = sizeof(*index)
291
+ sizeof(*packed_hash) * (hsize+1)
292
+ sizeof(*packed_entry) * (num_entries + hsize * EXTRA_NULLS);
293
mem = malloc(memsize);
299
index->memsize = memsize;
300
index->hash_mask = hmask;
301
index->num_entries = num_entries;
303
if (hmask < old_index->hash_mask) {
304
fprintf(stderr, "hash mask was shrunk %x => %x\n",
305
old_index->hash_mask, hmask);
307
assert(hmask >= old_index->hash_mask);
312
mem = packed_hash + (hsize+1);
315
for (i = 0; i < hsize; i++) {
317
* Coalesce all entries belonging to one linked list
318
* into consecutive array entries.
320
packed_hash[i] = packed_entry;
321
/* Old comes earlier as a source, so it always comes first in a given
325
/* Could we optimize this to use memcpy when hmask ==
326
* old_index->hash_mask? Would it make any real difference?
328
j = i & old_index->hash_mask;
329
copy_from = old_index->hash[j];
330
for (old_entry = old_index->hash[j];
331
old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;
333
if ((old_entry->val & hmask) == i) {
334
*packed_entry++ = *old_entry;
338
for (entry = hash[i]; entry; entry = entry->next) {
339
*packed_entry++ = entry->entry;
341
/* TODO: At this point packed_entry - packed_hash[i] is the number of
342
* records that we have inserted into this hash bucket.
343
* We should *really* consider doing some limiting along the
344
* lines of limit_hash_buckets() to avoid pathological behavior.
346
/* Now add extra 'NULL' entries that we can use for future expansion. */
347
for (j = 0; j < EXTRA_NULLS; ++j ) {
348
*packed_entry++ = null_entry;
352
/* Sentinel value to indicate the length of the last hash bucket */
353
packed_hash[hsize] = packed_entry;
355
if (packed_entry - (struct index_entry *)mem
356
!= num_entries + hsize*EXTRA_NULLS) {
357
fprintf(stderr, "We expected %d entries, but created %d\n",
358
num_entries + hsize*EXTRA_NULLS,
359
(int)(packed_entry - (struct index_entry*)mem));
361
assert(packed_entry - (struct index_entry *)mem
362
== num_entries + hsize*EXTRA_NULLS);
363
index->last_entry = (packed_entry - 1);
369
create_delta_index(const struct source_info *src,
370
struct delta_index *old)
372
unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
373
unsigned int total_num_entries;
374
const unsigned char *data, *buffer;
375
struct delta_index *index;
376
struct unpacked_index_entry *entry, **hash;
378
unsigned long memsize;
380
if (!src->buf || !src->size)
384
/* Determine index hash size. Note that indexing skips the
385
first byte to allow for optimizing the Rabin's polynomial
386
initialization in create_delta(). */
387
num_entries = (src->size - 1) / RABIN_WINDOW;
389
total_num_entries = num_entries + old->num_entries;
391
total_num_entries = num_entries;
392
hsize = total_num_entries / 4;
393
for (i = 4; (1u << i) < hsize && i < 31; i++);
396
if (old && old->hash_mask > hmask) {
397
hmask = old->hash_mask;
401
/* allocate lookup index */
402
memsize = sizeof(*hash) * hsize +
403
sizeof(*entry) * total_num_entries;
404
mem = malloc(memsize);
411
memset(hash, 0, hsize * sizeof(*hash));
413
/* allocate an array to count hash num_entries */
414
hash_count = calloc(hsize, sizeof(*hash_count));
420
/* then populate the index for the new data */
422
for (data = buffer + num_entries * RABIN_WINDOW - RABIN_WINDOW;
424
data -= RABIN_WINDOW) {
425
unsigned int val = 0;
426
for (i = 1; i <= RABIN_WINDOW; i++)
427
val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
428
if (val == prev_val) {
429
/* keep the lowest of consecutive identical blocks */
430
entry[-1].entry.ptr = data + RABIN_WINDOW;
436
entry->entry.ptr = data + RABIN_WINDOW;
437
entry->entry.val = val;
438
entry->entry.src = src;
439
entry->next = hash[i];
444
/* TODO: It would be nice to limit_hash_buckets at a better time. */
445
total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
451
index = pack_delta_index(hash, hsize, total_num_entries, old);
456
index->last_src = src;
460
/* Take some entries, and put them into a custom hash.
461
* @param entries A list of entries, sorted by position in file
462
* @param num_entries Length of entries
463
* @param out_hsize The maximum size of the hash, the final size will be
466
struct index_entry_linked_list **
467
_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,
470
unsigned int hash_offset, hmask, memsize;
471
struct index_entry *entry, *last_entry;
472
struct index_entry_linked_list *out_entry, **hash;
477
memsize = sizeof(*hash) * hsize +
478
sizeof(*out_entry) * num_entries;
479
mem = malloc(memsize);
486
memset(hash, 0, sizeof(*hash)*(hsize+1));
488
/* We know that entries are in the order we want in the output, but they
489
* aren't "grouped" by hash bucket yet.
491
last_entry = entries + num_entries;
492
for (entry = entries + num_entries - 1; entry >= entries; --entry) {
493
hash_offset = entry->val & hmask;
494
out_entry->p_entry = entry;
495
out_entry->next = hash[hash_offset];
496
/* TODO: Remove entries that have identical vals, or at least filter
497
* the map a little bit.
498
* if (hash[i] != NULL) {
501
hash[hash_offset] = out_entry;
509
create_index_from_old_and_new_entries(const struct delta_index *old_index,
510
struct index_entry *entries,
511
unsigned int num_entries)
513
unsigned int i, j, hsize, hmask, total_num_entries;
514
struct delta_index *index;
515
struct index_entry *entry, *packed_entry, **packed_hash;
516
struct index_entry *last_entry, null_entry = {0};
518
unsigned long memsize;
519
struct index_entry_linked_list *unpacked_entry, **mini_hash;
521
/* Determine index hash size. Note that indexing skips the
522
first byte to allow for optimizing the Rabin's polynomial
523
initialization in create_delta(). */
524
total_num_entries = num_entries + old_index->num_entries;
525
hsize = total_num_entries / 4;
526
for (i = 4; (1u << i) < hsize && i < 31; i++);
528
if (hsize < old_index->hash_mask) {
529
/* For some reason, there was a code path that would actually *shrink*
530
* the hash size. This screws with some later code, and in general, I
531
* think it better to make the hash bigger, rather than smaller. So
532
* we'll just force the size here.
533
* Possibly done by create_delta_index running into a
534
* limit_hash_buckets call, that ended up transitioning across a
535
* power-of-2. The cause isn't 100% clear, though.
537
hsize = old_index->hash_mask + 1;
540
// fprintf(stderr, "resizing index to insert %d entries into array"
541
// " with %d entries: %x => %x\n",
542
// num_entries, old_index->num_entries, old_index->hash_mask, hmask);
544
memsize = sizeof(*index)
545
+ sizeof(*packed_hash) * (hsize+1)
546
+ sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);
547
mem = malloc(memsize);
552
index->memsize = memsize;
553
index->hash_mask = hmask;
554
index->num_entries = total_num_entries;
555
index->last_src = old_index->last_src;
559
mem = packed_hash + (hsize+1);
562
mini_hash = _put_entries_into_hash(entries, num_entries, hsize);
563
if (mini_hash == NULL) {
567
last_entry = entries + num_entries;
568
for (i = 0; i < hsize; i++) {
570
* Coalesce all entries belonging in one hash bucket
571
* into consecutive array entries.
572
* The entries in old_index all come before 'entries'.
574
packed_hash[i] = packed_entry;
575
/* Copy any of the old entries across */
576
/* Would we rather use memcpy? */
577
if (hmask == old_index->hash_mask) {
578
for (entry = old_index->hash[i];
579
entry < old_index->hash[i+1] && entry->ptr != NULL;
581
assert((entry->val & hmask) == i);
582
*packed_entry++ = *entry;
585
/* If we resized the index from this action, all of the old values
586
* will be found in the previous location, but they will end up
587
* spread across the new locations.
589
j = i & old_index->hash_mask;
590
for (entry = old_index->hash[j];
591
entry < old_index->hash[j+1] && entry->ptr != NULL;
593
assert((entry->val & old_index->hash_mask) == j);
594
if ((entry->val & hmask) == i) {
595
/* Any entries not picked up here will be picked up on the
598
*packed_entry++ = *entry;
602
/* Now see if we need to insert any of the new entries.
603
* Note that loop ends up O(hsize*num_entries), so we expect that
604
* num_entries is always small.
605
* We also help a little bit by collapsing the entry range when the
606
* endpoints are inserted. However, an alternative would be to build a
607
* quick hash lookup for just the new entries.
608
* Testing shows that this list can easily get up to about 100
609
* entries, the tradeoff is a malloc, 1 pass over the entries, copying
610
* them into a sorted buffer, and a free() when done,
612
for (unpacked_entry = mini_hash[i];
614
unpacked_entry = unpacked_entry->next) {
615
assert((unpacked_entry->p_entry->val & hmask) == i);
616
*packed_entry++ = *(unpacked_entry->p_entry);
618
/* Now insert some extra nulls */
619
for (j = 0; j < EXTRA_NULLS; ++j) {
620
*packed_entry++ = null_entry;
625
/* Sentinel value to indicate the length of the last hash bucket */
626
packed_hash[hsize] = packed_entry;
628
if ((packed_entry - (struct index_entry *)mem)
629
!= (total_num_entries + hsize*EXTRA_NULLS)) {
630
fprintf(stderr, "We expected %d entries, but created %d\n",
631
total_num_entries + hsize*EXTRA_NULLS,
632
(int)(packed_entry - (struct index_entry*)mem));
635
assert((packed_entry - (struct index_entry *)mem)
636
== (total_num_entries + hsize * EXTRA_NULLS));
637
index->last_entry = (packed_entry - 1);
643
get_text(char buff[128], const unsigned char *ptr)
646
const unsigned char *start;
648
start = (ptr-RABIN_WINDOW-1);
650
if (cmd < 0x80) {// This is likely to be an insert instruction
651
if (cmd < RABIN_WINDOW) {
655
/* This was either a copy [should never be] or it
656
* was a longer insert so the insert start happened at 16 more
659
cmd = RABIN_WINDOW + 1;
662
cmd = 60; /* Be friendly to 80char terms */
664
/* Copy the 1 byte command, and 4 bytes after the insert */
666
memcpy(buff, start, cmd);
668
for (i = 0; i < cmd; ++i) {
669
if (buff[i] == '\n') {
671
} else if (buff[i] == '\t') {
678
create_delta_index_from_delta(const struct source_info *src,
679
struct delta_index *old_index)
681
unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;
682
unsigned int hash_offset;
683
const unsigned char *data, *buffer, *top;
685
struct delta_index *new_index;
686
struct index_entry *entry, *entries, *old_entry;
688
if (!src->buf || !src->size)
691
top = buffer + src->size;
693
/* Determine index hash size. Note that indexing skips the
694
first byte to allow for optimizing the Rabin's polynomial
695
initialization in create_delta().
696
This computes the maximum number of entries that could be held. The
697
actual number will be recomputed during processing.
700
max_num_entries = (src->size - 1) / RABIN_WINDOW;
702
/* allocate an array to hold whatever entries we find */
703
entries = malloc(sizeof(*entry) * max_num_entries);
704
if (!entries) /* malloc failure */
707
/* then populate the index for the new data */
711
get_delta_hdr_size(&data, top);
713
get_delta_hdr_size(&data, top);
714
entry = entries; /* start at the first slot */
715
num_entries = 0; /* calculate the real number of entries */
719
/* Copy instruction, skip it */
720
if (cmd & 0x01) data++;
721
if (cmd & 0x02) data++;
722
if (cmd & 0x04) data++;
723
if (cmd & 0x08) data++;
724
if (cmd & 0x10) data++;
725
if (cmd & 0x20) data++;
726
if (cmd & 0x40) data++;
728
/* Insert instruction, we want to index these bytes */
729
if (data + cmd > top) {
730
/* Invalid insert, not enough bytes in the delta */
733
/* The create_delta code requires a match at least 4 characters
734
* (including only the last char of the RABIN_WINDOW) before it
735
* will consider it something worth copying rather than inserting.
736
* So we don't want to index anything that we know won't ever be a
739
for (; cmd > RABIN_WINDOW + 3; cmd -= RABIN_WINDOW,
740
data += RABIN_WINDOW) {
741
unsigned int val = 0;
742
for (i = 1; i <= RABIN_WINDOW; i++)
743
val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
744
if (val != prev_val) {
745
/* Only keep the first of consecutive data */
748
entry->ptr = data + RABIN_WINDOW;
752
if (num_entries > max_num_entries) {
753
/* We ran out of entry room, something is really wrong
759
/* Move the data pointer by whatever remainder is left */
763
* cmd == 0 is reserved for future encoding
764
* extensions. In the mean time we must fail when
765
* encountering them (might be data corruption).
771
/* Something was wrong with this delta */
775
if (num_entries == 0) {
776
/** Nothing to index **/
780
assert(old_index != NULL);
781
old_index->last_src = src;
782
/* See if we can fill in these values into the holes in the array */
785
for (; num_entries > 0; --num_entries, ++entry) {
786
hash_offset = (entry->val & old_index->hash_mask);
787
/* The basic structure is a hash => packed_entries that fit in that
788
* hash bucket. Things are structured such that the hash-pointers are
789
* strictly ordered. So we start by pointing to the next pointer, and
790
* walk back until we stop getting NULL targets, and then go back
791
* forward. If there are no NULL targets, then we know because
792
* entry->ptr will not be NULL.
794
old_entry = old_index->hash[hash_offset + 1];
796
while (old_entry->ptr == NULL
797
&& old_entry >= old_index->hash[hash_offset]) {
801
if (old_entry->ptr != NULL
802
|| old_entry >= old_index->hash[hash_offset + 1]) {
803
/* There is no room for this entry, we have to resize */
805
// get_text(buff, entry->ptr);
806
// fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",
807
// hash_offset, entry->val, buff);
808
// for (old_entry = old_index->hash[hash_offset];
809
// old_entry < old_index->hash[hash_offset+1];
811
// get_text(buff, old_entry->ptr);
812
// fprintf(stderr, " [%2d] %8x %8x: '%s'\n",
813
// (int)(old_entry - old_index->hash[hash_offset]),
814
// old_entry->val, old_entry->ptr, buff);
820
/* For entries which we *do* manage to insert into old_index, we don't
821
* want them double copied into the final output.
823
old_index->num_entries++;
825
if (num_entries > 0) {
826
/* We couldn't fit the new entries into the old index, so allocate a
827
* new one, and fill it with stuff.
829
// fprintf(stderr, "inserted %d before resize\n", num_inserted);
830
new_index = create_index_from_old_and_new_entries(old_index,
834
// fprintf(stderr, "inserted %d without resizing\n", num_inserted);
840
void free_delta_index(struct delta_index *index)
845
unsigned long sizeof_delta_index(struct delta_index *index)
848
return index->memsize;
854
* The maximum size for any opcode sequence, including the initial header
855
* plus Rabin window plus biggest copy.
857
#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
860
create_delta(const struct delta_index *index,
861
const void *trg_buf, unsigned long trg_size,
862
unsigned long *delta_size, unsigned long max_size)
864
unsigned int i, outpos, outsize, moff, msize, val;
865
const struct source_info *msource;
867
const unsigned char *ref_data, *ref_top, *data, *top;
869
unsigned long source_size;
871
if (!trg_buf || !trg_size)
878
if (max_size && outsize >= max_size)
879
outsize = max_size + MAX_OP_SIZE + 1;
880
out = malloc(outsize);
884
/* store reference buffer size */
885
source_size = index->last_src->size + index->last_src->agg_offset;
888
out[outpos++] = i | 0x80;
893
/* store target buffer size */
896
out[outpos++] = i | 0x80;
902
top = (const unsigned char *) trg_buf + trg_size;
904
/* Start the matching by filling out with a simple 'insert' instruction, of
905
* the first RABIN_WINDOW bytes of the input.
907
outpos++; /* leave a byte for the insert command */
909
for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
910
out[outpos++] = *data;
911
val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
913
/* we are now setup with an insert of 'i' bytes and val contains the RABIN
914
* hash for those bytes, and data points to the RABIN_WINDOW+1 byte of
924
/* we don't have a 'worthy enough' match yet, so let's look for
927
struct index_entry *entry;
928
/* Shift the window by one byte. */
929
val ^= U[data[-RABIN_WINDOW]];
930
val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
931
i = val & index->hash_mask;
932
/* TODO: When using multiple indexes like this, the hash tables
933
* mapping val => index_entry become less efficient.
934
* You end up getting a lot more collisions in the hash,
935
* which doesn't actually lead to a entry->val match.
937
for (entry = index->hash[i];
938
entry < index->hash[i+1] && entry->src != NULL;
940
const unsigned char *ref;
941
const unsigned char *src;
942
unsigned int ref_size;
943
if (entry->val != val)
947
ref_data = entry->src->buf;
948
ref_top = ref_data + entry->src->size;
949
ref_size = ref_top - ref;
950
/* ref_size is the longest possible match that we could make
951
* here. If ref_size <= msize, then we know that we cannot
952
* match more bytes with this location that we have already
955
if (ref_size > top - src)
956
ref_size = top - src;
957
if (ref_size <= msize)
959
/* See how many bytes actually match at this location. */
960
while (ref_size-- && *src++ == *ref)
962
if (msize < ref - entry->ptr) {
963
/* this is our best match so far */
964
msize = ref - entry->ptr;
965
msource = entry->src;
966
moff = entry->ptr - ref_data;
967
if (msize >= 4096) /* good enough */
974
/* The best match right now is less than 4 bytes long. So just add
975
* the current byte to the insert instruction. Increment the insert
976
* counter, and copy the byte of data into the output buffer.
980
out[outpos++] = *data++;
982
if (inscnt == 0x7f) {
983
/* We have a max length insert instruction, finalize it in the
986
out[outpos - inscnt - 1] = inscnt;
995
ref_data = msource->buf;
996
while (moff && ref_data[moff-1] == data[-1]) {
997
/* we can match one byte back */
1004
outpos--; /* remove count slot */
1005
inscnt--; /* make it -1 */
1008
out[outpos - inscnt - 1] = inscnt;
1012
/* A copy op is currently limited to 64KB (pack v2) */
1013
left = (msize < 0x10000) ? 0 : (msize - 0x10000);
1016
op = out + outpos++;
1019
/* moff is the offset in the local structure, for encoding, we need
1020
* to push it into the global offset
1022
assert(moff < msource->size);
1023
moff += msource->agg_offset;
1024
assert(moff + msize <= source_size);
1025
if (moff & 0x000000ff)
1026
out[outpos++] = moff >> 0, i |= 0x01;
1027
if (moff & 0x0000ff00)
1028
out[outpos++] = moff >> 8, i |= 0x02;
1029
if (moff & 0x00ff0000)
1030
out[outpos++] = moff >> 16, i |= 0x04;
1031
if (moff & 0xff000000)
1032
out[outpos++] = moff >> 24, i |= 0x08;
1033
/* Put it back into local coordinates, in case we have multiple
1036
moff -= msource->agg_offset;
1039
out[outpos++] = msize >> 0, i |= 0x10;
1041
out[outpos++] = msize >> 8, i |= 0x20;
1052
for (j = -RABIN_WINDOW; j < 0; j++)
1053
val = ((val << 8) | data[j])
1054
^ T[val >> RABIN_SHIFT];
1058
if (outpos >= outsize - MAX_OP_SIZE) {
1060
outsize = outsize * 3 / 2;
1061
if (max_size && outsize >= max_size)
1062
outsize = max_size + MAX_OP_SIZE + 1;
1063
if (max_size && outpos > max_size)
1065
out = realloc(out, outsize);
1074
out[outpos - inscnt - 1] = inscnt;
1076
if (max_size && outpos > max_size) {
1081
*delta_size = outpos;
1085
/* vim: et ts=4 sw=4 sts=4