198
211
static struct delta_index *
199
212
pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
200
unsigned int num_entries)
213
unsigned int num_entries, struct delta_index *old_index)
202
unsigned int i, memsize;
215
unsigned int i, j, hmask, memsize, fit_in_old, copied_count;
203
216
struct unpacked_index_entry *entry;
204
217
struct delta_index *index;
205
struct index_entry *packed_entry, **packed_hash;
218
struct index_entry *packed_entry, **packed_hash, *old_entry, *copy_from;
219
struct index_entry null_entry = {0};
225
// fprintf(stderr, "Packing %d entries into %d for total of %d entries"
227
// num_entries - old_index->num_entries,
228
// old_index->num_entries, num_entries,
229
// old_index->hash_mask, hmask);
231
// fprintf(stderr, "Packing %d entries into a new index\n",
234
/* First, see if we can squeeze the new items into the existing structure.
238
if (old_index && old_index->hash_mask == hmask) {
240
for (i = 0; i < hsize; ++i) {
242
for (entry = hash[i]; entry; entry = entry->next) {
243
if (packed_entry == NULL) {
244
/* Find the last open spot */
245
packed_entry = old_index->hash[i + 1];
247
while (packed_entry >= old_index->hash[i]
248
&& packed_entry->ptr == NULL) {
253
if (packed_entry >= old_index->hash[i+1]
254
|| packed_entry->ptr != NULL) {
255
/* There are no free spots here :( */
259
/* We found an empty spot to put this entry
260
* Copy it over, and remove it from the linked list, just in
261
* case we end up running out of room later.
263
*packed_entry++ = entry->entry;
264
assert(entry == hash[i]);
265
hash[i] = entry->next;
267
old_index->num_entries++;
276
// fprintf(stderr, "Fit all %d entries into old index\n",
278
/* No need to allocate a new buffer */
281
// fprintf(stderr, "Fit only %d entries into old index,"
282
// " reallocating\n", copied_count);
208
286
* Now create the packed index in array form
209
287
* rather than linked lists.
288
* Leave a 2-entry gap for inserting more entries between the groups
211
290
memsize = sizeof(*index)
212
291
+ sizeof(*packed_hash) * (hsize+1)
213
+ sizeof(*packed_entry) * num_entries;
292
+ sizeof(*packed_entry) * (num_entries + hsize * EXTRA_NULLS);
214
293
mem = malloc(memsize);
220
299
index->memsize = memsize;
221
index->hash_mask = hsize - 1;
300
index->hash_mask = hmask;
222
301
index->num_entries = num_entries;
226
mem = packed_hash + (hsize+1);
229
for (i = 0; i < hsize; i++) {
231
* Coalesce all entries belonging to one linked list
232
* into consecutive array entries.
234
packed_hash[i] = packed_entry;
235
for (entry = hash[i]; entry; entry = entry->next)
236
*packed_entry++ = entry->entry;
239
/* Sentinel value to indicate the length of the last hash bucket */
240
packed_hash[hsize] = packed_entry;
242
assert(packed_entry - (struct index_entry *)mem == num_entries);
243
index->last_entry = (packed_entry - 1);
247
void include_entries_from_index(struct unpacked_index_entry **hash,
248
unsigned int *hash_count,
250
struct unpacked_index_entry *entry,
251
const struct delta_index *old)
253
unsigned int i, hmask, masked_val;
254
struct index_entry *old_entry;
257
/* Iterate backwards through the existing entries
258
* This is so that matches early in the file end up being the first pointer
259
* in the linked list.
260
* TODO: We know that all old entries are going to occur before the new
261
* entries, and that we are going to follow this with a limit and
262
* then pack step. There is probably a way we could optimize this
263
* step, so that we wouldn't have to copy everything into a new
264
* buffer, and then copy everything again into yet another buffer.
266
for (old_entry = old->last_entry, i = 0; i < old->num_entries;
268
masked_val = old_entry->val & hmask;
269
entry->entry = *old_entry;
270
entry->next = hash[masked_val];
271
hash[masked_val] = entry++;
272
hash_count[masked_val]++;
277
create_index_from_old_and_hash(const struct delta_index *old,
278
struct unpacked_index_entry **hash,
280
unsigned int num_entries)
282
unsigned int i, memsize, total_num_entries;
283
struct unpacked_index_entry *entry;
284
struct delta_index *index;
285
struct index_entry *packed_entry, **packed_hash;
286
struct index_entry *copy_from_start, *copy_to_start;
287
size_t total_copy, to_copy;
288
unsigned int num_ops;
289
unsigned int bytes_copied;
292
* Now create the packed index in array form
293
* rather than linked lists.
295
total_num_entries = num_entries + old->num_entries;
296
memsize = sizeof(*index)
297
+ sizeof(*packed_hash) * (hsize+1)
298
+ sizeof(*packed_entry) * total_num_entries;
299
mem = malloc(memsize);
305
index->memsize = memsize;
306
index->hash_mask = hsize - 1;
307
index->num_entries = total_num_entries;
311
mem = packed_hash + (hsize+1);
317
copy_from_start = copy_to_start = NULL;
318
for (i = 0; i < hsize; i++) {
320
* Coalesce all entries belonging to one linked list
321
* into consecutive array entries.
322
* The entries in old all come before the entries in hash.
324
packed_hash[i] = packed_entry;
325
to_copy = (old->hash[i+1] - old->hash[i]);
327
/* This next range can be copied wholesale. However, we will wait
328
* until we have to insert something from the new data before we
330
* So for now, just reserve space for the copy.
303
if (hmask < old_index->hash_mask) {
304
fprintf(stderr, "hash mask was shrunk %x => %x\n",
305
old_index->hash_mask, hmask);
307
assert(hmask >= old_index->hash_mask);
312
mem = packed_hash + (hsize+1);
315
for (i = 0; i < hsize; i++) {
317
* Coalesce all entries belonging to one linked list
318
* into consecutive array entries.
320
packed_hash[i] = packed_entry;
321
/* Old comes earlier as a source, so it always comes first in a given
325
/* Could we optimize this to use memcpy when hmask ==
326
* old_index->hash_mask? Would it make any real difference?
332
if (total_copy == 0) {
333
copy_from_start = old->hash[i];
334
copy_to_start = packed_entry;
328
j = i & old_index->hash_mask;
329
copy_from = old_index->hash[j];
330
for (old_entry = old_index->hash[j];
331
old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;
333
if ((old_entry->val & hmask) == i) {
334
*packed_entry++ = *old_entry;
336
packed_entry += to_copy;
337
total_copy += to_copy;
338
assert((packed_entry - copy_to_start) == total_copy);
339
assert((old->hash[i+1] - copy_from_start) == total_copy);
341
338
for (entry = hash[i]; entry; entry = entry->next) {
342
/* We have an entry to insert, so flush the copy buffer */
343
if (total_copy > 0) {
344
assert(copy_to_start != NULL);
345
assert(copy_from_start != NULL);
346
memcpy(copy_to_start, copy_from_start,
347
total_copy * sizeof(struct index_entry));
348
bytes_copied += (total_copy * sizeof(struct index_entry));
350
copy_from_start = copy_to_start = NULL;
353
339
*packed_entry++ = entry->entry;
356
if (total_copy > 0) {
357
assert(copy_to_start != NULL);
358
assert(copy_from_start != NULL);
359
memcpy(copy_to_start, copy_from_start,
360
total_copy * sizeof(struct index_entry));
361
bytes_copied += (total_copy * sizeof(struct index_entry));
341
/* TODO: At this point packed_entry - packed_hash[i] is the number of
342
* records that we have inserted into this hash bucket.
343
* We should *really* consider doing some limiting along the
344
* lines of limit_hash_buckets() to avoid pathological behavior.
346
/* Now add extra 'NULL' entries that we can use for future expansion. */
347
for (j = 0; j < EXTRA_NULLS; ++j ) {
348
*packed_entry++ = null_entry;
365
352
/* Sentinel value to indicate the length of the last hash bucket */
366
353
packed_hash[hsize] = packed_entry;
368
assert(packed_entry - (struct index_entry *)mem == total_num_entries);
355
if (packed_entry - (struct index_entry *)mem
356
!= num_entries + hsize*EXTRA_NULLS) {
357
fprintf(stderr, "We expected %d entries, but created %d\n",
358
num_entries + hsize*EXTRA_NULLS,
359
(int)(packed_entry - (struct index_entry*)mem));
361
assert(packed_entry - (struct index_entry *)mem
362
== num_entries + hsize*EXTRA_NULLS);
369
363
index->last_entry = (packed_entry - 1);
374
struct delta_index * create_delta_index(const struct source_info *src,
375
const struct delta_index *old)
369
create_delta_index(const struct source_info *src,
370
struct delta_index *old)
377
372
unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
378
373
unsigned int total_num_entries;
445
/* If we have an existing delta_index, we want to bring its info into the
446
* new structure. We assume that the existing structure's objects all occur
447
* before the new source, so they get first precedence in the index.
450
include_entries_from_index(hash, hash_count, hsize, entry, old);
444
/* TODO: It would be nice to limit_hash_buckets at a better time. */
452
445
total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
453
446
total_num_entries);
454
447
free(hash_count);
455
index = pack_delta_index(hash, hsize, total_num_entries);
456
index->last_src = src;
451
index = pack_delta_index(hash, hsize, total_num_entries, old);
456
index->last_src = src;
460
/* Take some entries, and put them into a custom hash.
461
* @param entries A list of entries, sorted by position in file
462
* @param num_entries Length of entries
463
* @param out_hsize The maximum size of the hash, the final size will be
466
struct index_entry_linked_list **
467
_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,
470
unsigned int hash_offset, hmask, memsize;
471
struct index_entry *entry, *last_entry;
472
struct index_entry_linked_list *out_entry, **hash;
477
memsize = sizeof(*hash) * hsize +
478
sizeof(*out_entry) * num_entries;
479
mem = malloc(memsize);
486
memset(hash, 0, sizeof(*hash)*(hsize+1));
488
/* We know that entries are in the order we want in the output, but they
489
* aren't "grouped" by hash bucket yet.
491
last_entry = entries + num_entries;
492
for (entry = entries + num_entries - 1; entry >= entries; --entry) {
493
hash_offset = entry->val & hmask;
494
out_entry->p_entry = entry;
495
out_entry->next = hash[hash_offset];
496
/* TODO: Remove entries that have identical vals, or at least filter
497
* the map a little bit.
498
* if (hash[i] != NULL) {
501
hash[hash_offset] = out_entry;
509
create_index_from_old_and_new_entries(const struct delta_index *old_index,
510
struct index_entry *entries,
511
unsigned int num_entries)
513
unsigned int i, j, hsize, hmask, total_num_entries;
514
struct delta_index *index;
515
struct index_entry *entry, *packed_entry, **packed_hash;
516
struct index_entry *last_entry, null_entry = {0};
518
unsigned long memsize;
519
struct index_entry_linked_list *unpacked_entry, **mini_hash;
521
/* Determine index hash size. Note that indexing skips the
522
first byte to allow for optimizing the Rabin's polynomial
523
initialization in create_delta(). */
524
total_num_entries = num_entries + old_index->num_entries;
525
hsize = total_num_entries / 4;
526
for (i = 4; (1u << i) < hsize && i < 31; i++);
528
if (hsize < old_index->hash_mask) {
529
/* For some reason, there was a code path that would actually *shrink*
530
* the hash size. This screws with some later code, and in general, I
531
* think it better to make the hash bigger, rather than smaller. So
532
* we'll just force the size here.
533
* Possibly done by create_delta_index running into a
534
* limit_hash_buckets call, that ended up transitioning across a
535
* power-of-2. The cause isn't 100% clear, though.
537
hsize = old_index->hash_mask + 1;
540
// fprintf(stderr, "resizing index to insert %d entries into array"
541
// " with %d entries: %x => %x\n",
542
// num_entries, old_index->num_entries, old_index->hash_mask, hmask);
544
memsize = sizeof(*index)
545
+ sizeof(*packed_hash) * (hsize+1)
546
+ sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);
547
mem = malloc(memsize);
552
index->memsize = memsize;
553
index->hash_mask = hmask;
554
index->num_entries = total_num_entries;
555
index->last_src = old_index->last_src;
559
mem = packed_hash + (hsize+1);
562
mini_hash = _put_entries_into_hash(entries, num_entries, hsize);
563
if (mini_hash == NULL) {
567
last_entry = entries + num_entries;
568
for (i = 0; i < hsize; i++) {
570
* Coalesce all entries belonging in one hash bucket
571
* into consecutive array entries.
572
* The entries in old_index all come before 'entries'.
574
packed_hash[i] = packed_entry;
575
/* Copy any of the old entries across */
576
/* Would we rather use memcpy? */
577
if (hmask == old_index->hash_mask) {
578
for (entry = old_index->hash[i];
579
entry < old_index->hash[i+1] && entry->ptr != NULL;
581
assert((entry->val & hmask) == i);
582
*packed_entry++ = *entry;
585
/* If we resized the index from this action, all of the old values
586
* will be found in the previous location, but they will end up
587
* spread across the new locations.
589
j = i & old_index->hash_mask;
590
for (entry = old_index->hash[j];
591
entry < old_index->hash[j+1] && entry->ptr != NULL;
593
assert((entry->val & old_index->hash_mask) == j);
594
if ((entry->val & hmask) == i) {
595
/* Any entries not picked up here will be picked up on the
598
*packed_entry++ = *entry;
602
/* Now see if we need to insert any of the new entries.
603
* Note that loop ends up O(hsize*num_entries), so we expect that
604
* num_entries is always small.
605
* We also help a little bit by collapsing the entry range when the
606
* endpoints are inserted. However, an alternative would be to build a
607
* quick hash lookup for just the new entries.
608
* Testing shows that this list can easily get up to about 100
609
* entries, the tradeoff is a malloc, 1 pass over the entries, copying
610
* them into a sorted buffer, and a free() when done,
612
for (unpacked_entry = mini_hash[i];
614
unpacked_entry = unpacked_entry->next) {
615
assert((unpacked_entry->p_entry->val & hmask) == i);
616
*packed_entry++ = *(unpacked_entry->p_entry);
618
/* Now insert some extra nulls */
619
for (j = 0; j < EXTRA_NULLS; ++j) {
620
*packed_entry++ = null_entry;
625
/* Sentinel value to indicate the length of the last hash bucket */
626
packed_hash[hsize] = packed_entry;
628
if ((packed_entry - (struct index_entry *)mem)
629
!= (total_num_entries + hsize*EXTRA_NULLS)) {
630
fprintf(stderr, "We expected %d entries, but created %d\n",
631
total_num_entries + hsize*EXTRA_NULLS,
632
(int)(packed_entry - (struct index_entry*)mem));
635
assert((packed_entry - (struct index_entry *)mem)
636
== (total_num_entries + hsize * EXTRA_NULLS));
637
index->last_entry = (packed_entry - 1);
643
get_text(char buff[128], const unsigned char *ptr)
646
const unsigned char *start;
648
start = (ptr-RABIN_WINDOW-1);
650
if (cmd < 0x80) {// This is likely to be an insert instruction
651
if (cmd < RABIN_WINDOW) {
655
/* This was either a copy [should never be] or it
656
* was a longer insert so the insert start happened at 16 more
659
cmd = RABIN_WINDOW + 1;
662
cmd = 60; /* Be friendly to 80char terms */
664
/* Copy the 1 byte command, and 4 bytes after the insert */
666
memcpy(buff, start, cmd);
668
for (i = 0; i < cmd; ++i) {
669
if (buff[i] == '\n') {
671
} else if (buff[i] == '\t') {
464
677
struct delta_index *
465
678
create_delta_index_from_delta(const struct source_info *src,
466
const struct delta_index *old)
679
struct delta_index *old_index)
468
unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
469
unsigned int total_num_entries;
681
unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;
682
unsigned int hash_offset;
470
683
const unsigned char *data, *buffer, *top;
471
684
unsigned char cmd;
472
struct delta_index *index;
473
struct unpacked_index_entry *entry, **hash;
475
unsigned long memsize;
685
struct delta_index *new_index;
686
struct index_entry *entry, *entries, *old_entry;
477
688
if (!src->buf || !src->size)
590
770
if (data != top) {
591
771
/* Something was wrong with this delta */
596
775
if (num_entries == 0) {
597
776
/** Nothing to index **/
603
if (hmask == old->hash_mask) {
604
/* The total hash table size didn't change, which means that
605
* entries will end up in the same pages. We can do bulk-copying to
606
* get the final output
608
index = create_index_from_old_and_hash(old, hash, hsize,
615
index->last_src = src;
618
total_num_entries = num_entries + old->num_entries;
619
include_entries_from_index(hash, hash_count, hsize, entry, old);
780
assert(old_index != NULL);
781
old_index->last_src = src;
782
/* See if we can fill in these values into the holes in the array */
785
for (; num_entries > 0; --num_entries, ++entry) {
786
hash_offset = (entry->val & old_index->hash_mask);
787
/* The basic structure is a hash => packed_entries that fit in that
788
* hash bucket. Things are structured such that the hash-pointers are
789
* strictly ordered. So we start by pointing to the next pointer, and
790
* walk back until we stop getting NULL targets, and then go back
791
* forward. If there are no NULL targets, then we know because
792
* entry->ptr will not be NULL.
794
old_entry = old_index->hash[hash_offset + 1];
796
while (old_entry->ptr == NULL
797
&& old_entry >= old_index->hash[hash_offset]) {
801
if (old_entry->ptr != NULL
802
|| old_entry >= old_index->hash[hash_offset + 1]) {
803
/* There is no room for this entry, we have to resize */
805
// get_text(buff, entry->ptr);
806
// fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",
807
// hash_offset, entry->val, buff);
808
// for (old_entry = old_index->hash[hash_offset];
809
// old_entry < old_index->hash[hash_offset+1];
811
// get_text(buff, old_entry->ptr);
812
// fprintf(stderr, " [%2d] %8x %8x: '%s'\n",
813
// (int)(old_entry - old_index->hash[hash_offset]),
814
// old_entry->val, old_entry->ptr, buff);
820
/* For entries which we *do* manage to insert into old_index, we don't
821
* want them double copied into the final output.
823
old_index->num_entries++;
825
if (num_entries > 0) {
826
/* We couldn't fit the new entries into the old index, so allocate a
827
* new one, and fill it with stuff.
829
// fprintf(stderr, "inserted %d before resize\n", num_inserted);
830
new_index = create_index_from_old_and_new_entries(old_index,
622
total_num_entries = num_entries;
625
total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
628
index = pack_delta_index(hash, hsize, total_num_entries);
633
index->last_src = src;
834
// fprintf(stderr, "inserted %d without resizing\n", num_inserted);
637
840
void free_delta_index(struct delta_index *index)