516
create_index_from_old_and_new_entries(const struct delta_index *old_index,
517
struct index_entry *entries,
518
unsigned int num_entries)
520
unsigned int i, j, hsize, hmask, total_num_entries, found_null;
521
struct delta_index *index;
522
struct index_entry *entry, *packed_entry, **packed_hash;
523
struct index_entry *last_entry, null_entry = {0};
525
unsigned long memsize;
527
/* Determine index hash size. Note that indexing skips the
528
first byte to allow for optimizing the Rabin's polynomial
529
initialization in create_delta(). */
530
total_num_entries = num_entries + old_index->num_entries;
531
hsize = total_num_entries / 4;
532
for (i = 4; (1u << i) < hsize && i < 31; i++);
534
if (hsize < old_index->hash_mask) {
535
/* For some reason, there was a code path that would actually *shrink*
536
* the hash size. This screws with some later code, and in general, I
537
* think it better to make the hash bigger, rather than smaller. So
538
* we'll just force the size here.
539
* Possibly done by create_delta_index running into a
540
* limit_hash_buckets call, that ended up transitioning across a
541
* power-of-2. The cause isn't 100% clear, though.
543
hsize = old_index->hash_mask + 1;
546
// fprintf(stderr, "resizing index to insert %d entries into array"
547
// " with %d entries: %x => %x\n",
548
// num_entries, old_index->num_entries, old_index->hash_mask, hmask);
550
memsize = sizeof(*index)
551
+ sizeof(*packed_hash) * (hsize+1)
552
+ sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);
553
mem = malloc(memsize);
558
index->memsize = memsize;
559
index->hash_mask = hmask;
560
index->num_entries = total_num_entries;
561
index->last_src = old_index->last_src;
565
mem = packed_hash + (hsize+1);
568
last_entry = entries + num_entries;
569
for (i = 0; i < hsize; i++) {
571
* Coalesce all entries belonging in one hash bucket
572
* into consecutive array entries.
573
* The entries in old_index all come before 'entries'.
575
packed_hash[i] = packed_entry;
576
/* Copy any of the old entries across */
577
/* Would we rather use memcpy? */
578
if (hmask == old_index->hash_mask) {
580
for (entry = old_index->hash[i];
581
entry < old_index->hash[i+1];
583
if (entry->ptr == NULL) {
586
} else if (found_null) {
587
fprintf(stderr, "Found a non-null entry after a"
593
assert((entry->val & hmask) == i);
594
*packed_entry++ = *entry;
597
/* If we resized the index from this action, all of the old values
598
* will be found in the previous location, but they will end up
599
* spread across the new locations.
601
j = i & old_index->hash_mask;
603
for (entry = old_index->hash[j];
604
entry < old_index->hash[j+1];
606
if (entry->ptr == NULL) {
609
} else if (found_null) {
610
fprintf(stderr, "Found a non-null entry after a"
612
" offset %x for new offset %x"
616
assert((entry->val & old_index->hash_mask) == j);
617
if (!(j == i || j + old_index->hash_mask + 1 == i)) {
618
fprintf(stderr, "Old hash entry %x"
619
" doesn't fit with new %x\n"
620
"old_mask: %x new_mask: %x\n",
621
j, i, old_index->hash_mask, hmask);
623
assert(j == i || j + old_index->hash_mask + 1 == i);
624
if ((entry->val & hmask) == i) {
625
/* Any entries not picked up here will be picked up on the
628
*packed_entry++ = *entry;
632
/* Now see if we need to insert any of the new entries.
633
* Note that loop ends up O(hsize*num_entries), so we expect that
634
* num_entries is always small.
635
* We also help a little bit by collapsing the entry range when the
636
* endpoints are inserted. However, an alternative would be to build a
637
* quick hash lookup for just the new entries.
638
* Testing shows that this list can easily get up to about 100
639
* entries, the tradeoff is a malloc, 1 pass over the entries, copying
640
* them into a sorted buffer, and a free() when done,
642
for (entry = entries; entry < last_entry; ++entry) {
643
if (entry->ptr != NULL && ((entry->val & hmask) == i)) {
644
/* A non-empty entry, which fits in this hash bucket */
645
*packed_entry++ = *entry;
647
if (entry == entries) {
648
/* This is the first entry in the list, skip all of the
649
* null entries from the front.
651
while (entries < last_entry && entries->ptr == 0) {
653
/* we can also skip processing any of these entries in
658
/* But we need to adjust back one, because the loop will
662
} else if (entry == (last_entry - 1)) {
663
/* This is the last entry, remove all the trailing null
664
* entries from the list.
667
while (last_entry > entries && last_entry->ptr == NULL) {
674
/* Now insert some extra nulls */
675
for (j = 0; j < EXTRA_NULLS; ++j) {
676
*packed_entry++ = null_entry;
680
/* Sentinel value to indicate the length of the last hash bucket */
681
packed_hash[hsize] = packed_entry;
683
if ((packed_entry - (struct index_entry *)mem)
684
!= (total_num_entries + hsize*EXTRA_NULLS)) {
685
fprintf(stderr, "We expected %d entries, but created %d\n",
686
total_num_entries + hsize*EXTRA_NULLS,
687
(int)(packed_entry - (struct index_entry*)mem));
690
assert((packed_entry - (struct index_entry *)mem)
691
== (total_num_entries + hsize * EXTRA_NULLS));
692
index->last_entry = (packed_entry - 1);
698
get_text(char buff[128], const unsigned char *ptr)
701
const unsigned char *start;
703
start = (ptr-RABIN_WINDOW-1);
705
if (cmd < 0x80) {// This is likely to be an insert instruction
706
if (cmd < RABIN_WINDOW) {
710
/* This was either a copy [should never be] or it
711
* was a longer insert so the insert start happened at 16 more
714
cmd = RABIN_WINDOW + 1;
717
cmd = 60; /* Be friendly to 80char terms */
719
/* Copy the 1 byte command, and 4 bytes after the insert */
721
memcpy(buff, start, cmd);
723
for (i = 0; i < cmd; ++i) {
724
if (buff[i] == '\n') {
726
} else if (buff[i] == '\t') {
505
732
struct delta_index *
506
733
create_delta_index_from_delta(const struct source_info *src,
507
const struct delta_index *old)
734
struct delta_index *old_index)
509
unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
510
unsigned int total_num_entries;
736
unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;
737
unsigned int hash_offset;
511
738
const unsigned char *data, *buffer, *top;
512
739
unsigned char cmd;
513
struct delta_index *index;
514
struct unpacked_index_entry *entry, **hash;
516
unsigned long memsize;
740
struct delta_index *new_index;
741
struct index_entry *entry, *entries, *old_entry;
518
743
if (!src->buf || !src->size)
527
752
actual number will be recomputed during processing.
530
num_entries = (src->size - 1) / RABIN_WINDOW;
532
total_num_entries = num_entries + old->num_entries;
534
total_num_entries = num_entries;
535
hsize = total_num_entries / 4;
536
for (i = 4; (1u << i) < hsize && i < 31; i++);
540
/* allocate lookup index */
541
memsize = sizeof(*hash) * hsize +
542
sizeof(*entry) * total_num_entries;
543
mem = malloc(memsize);
550
memset(hash, 0, hsize * sizeof(*hash));
552
/* allocate an array to count hash num_entries */
553
hash_count = calloc(hsize, sizeof(*hash_count));
755
max_num_entries = (src->size - 1) / RABIN_WINDOW;
757
/* allocate an array to hold whatever entries we find */
758
entries = malloc(sizeof(*entry) * max_num_entries);
759
if (!entries) /* malloc failure */
559
762
/* then populate the index for the new data */
560
/* The create_delta_index code starts at the end and works backward,
561
* because it is easier to update the entry pointers (you always update the
562
* head, rather than updating the tail).
563
* However, we can't walk the delta structure that way.
567
765
/* source size */
568
766
get_delta_hdr_size(&data, top);
569
767
/* target size */
570
768
get_delta_hdr_size(&data, top);
769
entry = entries; /* start at the first slot */
571
770
num_entries = 0; /* calculate the real number of entries */
572
771
while (data < top) {
631
819
if (data != top) {
632
820
/* Something was wrong with this delta */
637
824
if (num_entries == 0) {
638
825
/** Nothing to index **/
644
if (hmask == old->hash_mask) {
645
/* The total hash table size didn't change, which means that
646
* entries will end up in the same pages. We can do bulk-copying to
647
* get the final output
649
index = create_index_from_old_and_hash(old, hash, hsize,
656
index->last_src = src;
659
total_num_entries = num_entries + old->num_entries;
660
include_entries_from_index(hash, hash_count, hsize, entry, old);
829
assert(old_index != NULL);
830
old_index->last_src = src;
831
/* See if we can fill in these values into the holes in the array */
834
for (; num_entries > 0; --num_entries, ++entry) {
835
hash_offset = (entry->val & old_index->hash_mask);
836
/* The basic structure is a hash => packed_entries that fit in that
837
* hash bucket. Things are structured such that the hash-pointers are
838
* strictly ordered. So we start by pointing to the next pointer, and
839
* walk back until we stop getting NULL targets, and then go back
840
* forward. If there are no NULL targets, then we know because
841
* entry->ptr will not be NULL.
843
old_entry = old_index->hash[hash_offset + 1];
845
while (old_entry->ptr == NULL
846
&& old_entry >= old_index->hash[hash_offset]) {
850
if (old_entry->ptr != NULL
851
|| old_entry >= old_index->hash[hash_offset + 1]) {
853
// get_text(buff, entry->ptr);
854
// fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",
855
// hash_offset, entry->val, buff);
856
// for (old_entry = old_index->hash[hash_offset];
857
// old_entry < old_index->hash[hash_offset+1];
859
// get_text(buff, old_entry->ptr);
860
// fprintf(stderr, " [%2d] %8x %8x: '%s'\n",
861
// (int)(old_entry - old_index->hash[hash_offset]),
862
// old_entry->val, old_entry->ptr, buff);
868
/* For entries which we *do* manage to insert into old_index, we don't
869
* want them double copied into the final output.
871
old_index->num_entries++;
873
if (num_entries > 0) {
874
/* We couldn't fit the new entries into the old index, so allocate a
875
* new one, and fill it with stuff.
877
// fprintf(stderr, "inserted %d before resize\n", num_inserted);
878
new_index = create_index_from_old_and_new_entries(old_index,
663
total_num_entries = num_entries;
666
total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
669
index = pack_delta_index(hash, hsize, total_num_entries);
674
index->last_src = src;
882
// fprintf(stderr, "inserted %d without resizing\n", num_inserted);
678
888
void free_delta_index(struct delta_index *index)