~bzr-pqm/bzr/bzr.dev

0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
1
/*
2
 * diff-delta.c: generate a delta between two buffers
3
 *
4
 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5
 * http://www.xmailserver.org/xdiff-lib.html
6
 *
7
 * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
0.17.31 by John Arbash Meinel
Bring in the 'rabin' experiment.
8
 * Adapted for Bazaar by John Arbash Meinel <john@arbash-meinel.com> (C) 2009
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
9
 *
4241.6.6 by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
Groupcompress from brisbane-core.
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * NB: The version in GIT is 'version 2 of the Licence only', however Nicolas
16
 * has granted permission for use under 'version 2 or later' in private email
17
 * to Robert Collins and Karl Fogel on the 6th April 2009.
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
18
 */
19
3735.33.4 by John Arbash Meinel
The new layout is working.
20
#include <stdio.h>
21
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
22
#include "delta.h"
0.17.31 by John Arbash Meinel
Bring in the 'rabin' experiment.
23
#include <stdlib.h>
24
#include <string.h>
0.23.5 by John Arbash Meinel
Minor changes to get diff-delta.c and patch-delta.c to compile.
25
#include <assert.h>
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
26
27
/* maximum hash entry list for the same hash bucket */
28
#define HASH_LIMIT 64
29
30
#define RABIN_SHIFT 23
31
#define RABIN_WINDOW 16
32
3735.33.13 by John Arbash Meinel
Tweak the number of blank spaces up just a tad.
33
/* The hash map is sized to put 4 entries per bucket, this gives us ~even room
34
 * for more data. Tweaking this number above 4 doesn't seem to help much,
35
 * anyway.
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
36
 */
3735.33.13 by John Arbash Meinel
Tweak the number of blank spaces up just a tad.
37
#define EXTRA_NULLS 4
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
38
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
39
static const unsigned int T[256] = {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
40
    0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
41
    0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
42
    0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
43
    0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
44
    0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
45
    0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
46
    0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
47
    0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
48
    0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
49
    0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
50
    0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
51
    0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
52
    0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
53
    0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
54
    0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
55
    0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
56
    0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
57
    0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
58
    0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
59
    0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
60
    0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
61
    0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
62
    0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
63
    0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
64
    0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
65
    0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
66
    0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
67
    0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
68
    0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
69
    0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
70
    0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
71
    0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
72
    0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
73
    0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
74
    0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
75
    0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
76
    0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
77
    0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
78
    0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
79
    0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
80
    0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
81
    0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
82
    0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
83
};
84
85
static const unsigned int U[256] = {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
86
    0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
87
    0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
88
    0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
89
    0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
90
    0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
91
    0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
92
    0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
93
    0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
94
    0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
95
    0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
96
    0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
97
    0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
98
    0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
99
    0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
100
    0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
101
    0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
102
    0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
103
    0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
104
    0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
105
    0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
106
    0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
107
    0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
108
    0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
109
    0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
110
    0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
111
    0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
112
    0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
113
    0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
114
    0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
115
    0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
116
    0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
117
    0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
118
    0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
119
    0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
120
    0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
121
    0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
122
    0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
123
    0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
124
    0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
125
    0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
126
    0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
127
    0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
128
    0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
129
};
130
131
struct index_entry {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
132
    const unsigned char *ptr;
133
    const struct source_info *src;
134
    unsigned int val;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
135
};
136
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
137
struct index_entry_linked_list {
138
    struct index_entry *p_entry;
139
    struct index_entry_linked_list *next;
140
};
141
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
142
struct unpacked_index_entry {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
143
    struct index_entry entry;
144
    struct unpacked_index_entry *next;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
145
};
146
147
struct delta_index {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
148
    unsigned long memsize; /* Total bytes pointed to by this index */
149
    const struct source_info *last_src; /* Information about the referenced source */
150
    unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
151
                               entry */
152
    unsigned int num_entries; /* The total number of entries in this index */
153
    struct index_entry *last_entry; /* Pointer to the last valid entry */
154
    struct index_entry *hash[];
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
155
};
156
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
157
static unsigned int
158
limit_hash_buckets(struct unpacked_index_entry **hash,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
159
                   unsigned int *hash_count, unsigned int hsize,
160
                   unsigned int entries)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
161
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
162
    struct unpacked_index_entry *entry;
163
    unsigned int i;
164
    /*
165
     * Determine a limit on the number of entries in the same hash
166
     * bucket.  This guards us against pathological data sets causing
167
     * really bad hash distribution with most entries in the same hash
168
     * bucket that would bring us to O(m*n) computing costs (m and n
169
     * corresponding to reference and target buffer sizes).
170
     *
171
     * Make sure none of the hash buckets has more entries than
172
     * we're willing to test.  Otherwise we cull the entry list
173
     * uniformly to still preserve a good repartition across
174
     * the reference buffer.
175
     */
176
    for (i = 0; i < hsize; i++) {
177
        int acc;
178
179
        if (hash_count[i] <= HASH_LIMIT)
180
            continue;
181
182
        /* We leave exactly HASH_LIMIT entries in the bucket */
183
        entries -= hash_count[i] - HASH_LIMIT;
184
185
        entry = hash[i];
186
        acc = 0;
187
188
        /*
189
         * Assume that this loop is gone through exactly
190
         * HASH_LIMIT times and is entered and left with
191
         * acc==0.  So the first statement in the loop
192
         * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
193
         * to the accumulator, and the inner loop consequently
194
         * is run (hash_count[i]-HASH_LIMIT) times, removing
195
         * one element from the list each time.  Since acc
196
         * balances out to 0 at the final run, the inner loop
197
         * body can't be left with entry==NULL.  So we indeed
198
         * encounter entry==NULL in the outer loop only.
199
         */
200
        do {
201
            acc += hash_count[i] - HASH_LIMIT;
202
            if (acc > 0) {
203
                struct unpacked_index_entry *keep = entry;
204
                do {
205
                    entry = entry->next;
206
                    acc -= HASH_LIMIT;
207
                } while (acc > 0);
208
                keep->next = entry->next;
209
            }
210
            entry = entry->next;
211
        } while (entry);
212
    }
213
    return entries;
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
214
}
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
215
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
216
static struct delta_index *
217
pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
218
                 unsigned int num_entries, struct delta_index *old_index)
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
219
{
3735.33.14 by John Arbash Meinel
Simplify the code a bit. We don't repack as often, so go with a
220
    unsigned int i, j, hmask, memsize, fit_in_old, copied_count;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
221
    struct unpacked_index_entry *entry;
222
    struct delta_index *index;
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
223
    struct index_entry *packed_entry, **packed_hash, *old_entry, *copy_from;
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
224
    struct index_entry null_entry = {0};
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
225
    void *mem;
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
226
227
    hmask = hsize - 1;
228
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
229
    // if (old_index) {
230
    //     fprintf(stderr, "Packing %d entries into %d for total of %d entries"
231
    //                     " %x => %x\n",
232
    //                     num_entries - old_index->num_entries,
233
    //                     old_index->num_entries, num_entries,
234
    //                     old_index->hash_mask, hmask);
235
    // } else {
236
    //     fprintf(stderr, "Packing %d entries into a new index\n",
237
    //                     num_entries);
238
    // }
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
239
    /* First, see if we can squeeze the new items into the existing structure.
240
     */
241
    fit_in_old = 0;
242
    copied_count = 0;
243
    if (old_index && old_index->hash_mask == hmask) {
244
        fit_in_old = 1;
245
        for (i = 0; i < hsize; ++i) {
246
            packed_entry = NULL;
247
            for (entry = hash[i]; entry; entry = entry->next) {
248
                if (packed_entry == NULL) {
249
                    /* Find the last open spot */
250
                    packed_entry = old_index->hash[i + 1];
251
                    --packed_entry;
252
                    while (packed_entry >= old_index->hash[i]
253
                           && packed_entry->ptr == NULL) {
254
                        --packed_entry;
255
                    }
256
                    ++packed_entry;
257
                }
258
                if (packed_entry >= old_index->hash[i+1]
259
                    || packed_entry->ptr != NULL) {
260
                    /* There are no free spots here :( */
261
                    fit_in_old = 0;
262
                    break;
263
                }
264
                /* We found an empty spot to put this entry
265
                 * Copy it over, and remove it from the linked list, just in
266
                 * case we end up running out of room later.
267
                 */
268
                *packed_entry++ = entry->entry;
269
                assert(entry == hash[i]);
270
                hash[i] = entry->next;
271
                copied_count += 1;
272
                old_index->num_entries++;
273
            }
274
            if (!fit_in_old) {
275
                break;
276
            }
277
        }
278
    }
279
    if (old_index) {
280
        if (fit_in_old) {
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
281
            // fprintf(stderr, "Fit all %d entries into old index\n",
282
            //                 copied_count);
3735.33.14 by John Arbash Meinel
Simplify the code a bit. We don't repack as often, so go with a
283
            /* No need to allocate a new buffer */
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
284
            return NULL;
285
        } else {
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
286
            // fprintf(stderr, "Fit only %d entries into old index,"
287
            //                 " reallocating\n", copied_count);
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
288
        }
289
    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
290
    /*
291
     * Now create the packed index in array form
292
     * rather than linked lists.
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
293
     * Leave a 2-entry gap for inserting more entries between the groups
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
294
     */
295
    memsize = sizeof(*index)
296
        + sizeof(*packed_hash) * (hsize+1)
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
297
        + sizeof(*packed_entry) * (num_entries + hsize * EXTRA_NULLS);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
298
    mem = malloc(memsize);
299
    if (!mem) {
300
        return NULL;
301
    }
302
303
    index = mem;
304
    index->memsize = memsize;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
305
    index->hash_mask = hmask;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
306
    index->num_entries = num_entries;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
307
    if (old_index) {
3735.33.16 by John Arbash Meinel
Handle when our current packing is sub-optimal.
308
        if (hmask < old_index->hash_mask) {
309
            fprintf(stderr, "hash mask was shrunk %x => %x\n",
310
                            old_index->hash_mask, hmask);
311
        }
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
312
        assert(hmask >= old_index->hash_mask);
313
    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
314
315
    mem = index->hash;
316
    packed_hash = mem;
317
    mem = packed_hash + (hsize+1);
318
    packed_entry = mem;
319
320
    for (i = 0; i < hsize; i++) {
321
        /*
322
         * Coalesce all entries belonging to one linked list
323
         * into consecutive array entries.
324
         */
325
        packed_hash[i] = packed_entry;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
326
        /* Old comes earlier as a source, so it always comes first in a given
327
         * hash bucket.
328
         */
329
        if (old_index) {
330
            /* Could we optimize this to use memcpy when hmask ==
331
             * old_index->hash_mask? Would it make any real difference?
332
             */
333
            j = i & old_index->hash_mask;
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
334
            copy_from = old_index->hash[j];
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
335
            for (old_entry = old_index->hash[j];
336
                 old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;
337
                 old_entry++) {
338
                if ((old_entry->val & hmask) == i) {
3735.33.14 by John Arbash Meinel
Simplify the code a bit. We don't repack as often, so go with a
339
                    *packed_entry++ = *old_entry;
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
340
                }
341
            }
342
        }
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
343
        for (entry = hash[i]; entry; entry = entry->next) {
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
344
            *packed_entry++ = entry->entry;
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
345
        }
3735.33.14 by John Arbash Meinel
Simplify the code a bit. We don't repack as often, so go with a
346
        /* TODO: At this point packed_entry - packed_hash[i] is the number of
347
         *       records that we have inserted into this hash bucket.
348
         *       We should *really* consider doing some limiting along the
349
         *       lines of limit_hash_buckets() to avoid pathological behavior.
350
         */
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
351
        /* Now add extra 'NULL' entries that we can use for future expansion. */
352
        for (j = 0; j < EXTRA_NULLS; ++j ) {
353
            *packed_entry++ = null_entry;
354
        }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
355
    }
356
357
    /* Sentinel value to indicate the length of the last hash bucket */
358
    packed_hash[hsize] = packed_entry;
359
3735.33.4 by John Arbash Meinel
The new layout is working.
360
    if (packed_entry - (struct index_entry *)mem
361
        != num_entries + hsize*EXTRA_NULLS) {
3735.33.5 by John Arbash Meinel
Restoring the debugging for now.
362
        fprintf(stderr, "We expected %d entries, but created %d\n",
363
                num_entries + hsize*EXTRA_NULLS,
364
                (int)(packed_entry - (struct index_entry*)mem));
3735.33.4 by John Arbash Meinel
The new layout is working.
365
    }
3735.33.3 by John Arbash Meinel
include_entries_from_hash wasn't properly skipping NULL records.
366
    assert(packed_entry - (struct index_entry *)mem
367
            == num_entries + hsize*EXTRA_NULLS);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
368
    index->last_entry = (packed_entry - 1);
369
    return index;
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
370
}
371
3735.33.4 by John Arbash Meinel
The new layout is working.
372
0.23.49 by John Arbash Meinel
When adding new entries to the delta index, use memcpy
373
struct delta_index *
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
374
create_delta_index(const struct source_info *src,
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
375
                   struct delta_index *old)
0.23.43 by John Arbash Meinel
Change the internals to allow delta indexes to be expanded with new source data.
376
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
377
    unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;
378
    unsigned int total_num_entries;
379
    const unsigned char *data, *buffer;
380
    struct delta_index *index;
381
    struct unpacked_index_entry *entry, **hash;
382
    void *mem;
383
    unsigned long memsize;
384
385
    if (!src->buf || !src->size)
386
        return NULL;
387
    buffer = src->buf;
388
389
    /* Determine index hash size.  Note that indexing skips the
390
       first byte to allow for optimizing the Rabin's polynomial
391
       initialization in create_delta(). */
392
    num_entries = (src->size - 1)  / RABIN_WINDOW;
393
    if (old != NULL)
394
        total_num_entries = num_entries + old->num_entries;
395
    else
396
        total_num_entries = num_entries;
3735.33.8 by John Arbash Meinel
Reverted back to the same hash width, and bumped EXTRA_NULLS to 3.
397
    hsize = total_num_entries / 4;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
398
    for (i = 4; (1u << i) < hsize && i < 31; i++);
399
    hsize = 1 << i;
400
    hmask = hsize - 1;
3735.33.18 by John Arbash Meinel
*grow* the local hmask if it is smaller than expected, don't *shrink* it.
401
    if (old && old->hash_mask > hmask) {
3735.33.16 by John Arbash Meinel
Handle when our current packing is sub-optimal.
402
        hmask = old->hash_mask;
403
        hsize = hmask + 1;
404
    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
405
406
    /* allocate lookup index */
407
    memsize = sizeof(*hash) * hsize +
408
          sizeof(*entry) * total_num_entries;
409
    mem = malloc(memsize);
410
    if (!mem)
411
        return NULL;
412
    hash = mem;
413
    mem = hash + hsize;
414
    entry = mem;
415
416
    memset(hash, 0, hsize * sizeof(*hash));
417
418
    /* allocate an array to count hash num_entries */
419
    hash_count = calloc(hsize, sizeof(*hash_count));
420
    if (!hash_count) {
421
        free(hash);
422
        return NULL;
423
    }
424
425
    /* then populate the index for the new data */
426
    prev_val = ~0;
427
    for (data = buffer + num_entries * RABIN_WINDOW - RABIN_WINDOW;
428
         data >= buffer;
429
         data -= RABIN_WINDOW) {
430
        unsigned int val = 0;
431
        for (i = 1; i <= RABIN_WINDOW; i++)
432
            val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
433
        if (val == prev_val) {
434
            /* keep the lowest of consecutive identical blocks */
435
            entry[-1].entry.ptr = data + RABIN_WINDOW;
436
            --num_entries;
437
            --total_num_entries;
438
        } else {
439
            prev_val = val;
440
            i = val & hmask;
441
            entry->entry.ptr = data + RABIN_WINDOW;
442
            entry->entry.val = val;
443
            entry->entry.src = src;
444
            entry->next = hash[i];
445
            hash[i] = entry++;
446
            hash_count[i]++;
447
        }
448
    }
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
449
    /* TODO: It would be nice to limit_hash_buckets at a better time. */
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
450
    total_num_entries = limit_hash_buckets(hash, hash_count, hsize,
451
                                           total_num_entries);
452
    free(hash_count);
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
453
    if (old) {
454
        old->last_src = src;
455
    }
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
456
    index = pack_delta_index(hash, hsize, total_num_entries, old);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
457
    free(hash);
458
    if (!index) {
459
        return NULL;
460
    }
3735.33.12 by John Arbash Meinel
Now we are able to weave 'add_source' into the existing index.
461
    index->last_src = src;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
462
    return index;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
463
}
464
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
465
/* Take some entries, and put them into a custom hash.
466
 * @param entries   A list of entries, sorted by position in file
467
 * @param num_entries   Length of entries
468
 * @param out_hsize     The maximum size of the hash, the final size will be
469
 *                      returned here
470
 */
471
struct index_entry_linked_list **
472
_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
473
                       unsigned int hsize)
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
474
{
3735.33.11 by John Arbash Meinel
Shave 5s->3.3s in add_source by copying old entries across directly.
475
    unsigned int hash_offset, hmask, memsize;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
476
    struct index_entry *entry, *last_entry;
477
    struct index_entry_linked_list *out_entry, **hash;
478
    void *mem;
479
480
    hmask = hsize - 1;
481
482
    memsize = sizeof(*hash) * hsize +
483
          sizeof(*out_entry) * num_entries;
484
    mem = malloc(memsize);
485
    if (!mem)
486
        return NULL;
487
    hash = mem;
488
    mem = hash + hsize;
489
    out_entry = mem;
490
491
    memset(hash, 0, sizeof(*hash)*(hsize+1));
492
493
    /* We know that entries are in the order we want in the output, but they
494
     * aren't "grouped" by hash bucket yet.
495
     */
496
    last_entry = entries + num_entries;
497
    for (entry = entries + num_entries - 1; entry >= entries; --entry) {
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
498
        hash_offset = entry->val & hmask;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
499
        out_entry->p_entry = entry;
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
500
        out_entry->next = hash[hash_offset];
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
501
        /* TODO: Remove entries that have identical vals, or at least filter
502
         *       the map a little bit.
503
         * if (hash[i] != NULL) {
504
         * }
505
         */
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
506
        hash[hash_offset] = out_entry;
507
        ++out_entry;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
508
    }
509
    return hash;
510
}
511
3735.33.4 by John Arbash Meinel
The new layout is working.
512
513
struct delta_index *
514
create_index_from_old_and_new_entries(const struct delta_index *old_index,
515
                                      struct index_entry *entries,
516
                                      unsigned int num_entries)
517
{
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
518
    unsigned int i, j, hsize, hmask, total_num_entries;
3735.33.4 by John Arbash Meinel
The new layout is working.
519
    struct delta_index *index;
520
    struct index_entry *entry, *packed_entry, **packed_hash;
521
    struct index_entry *last_entry, null_entry = {0};
522
    void *mem;
523
    unsigned long memsize;
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
524
    struct index_entry_linked_list *unpacked_entry, **mini_hash;
3735.33.4 by John Arbash Meinel
The new layout is working.
525
526
    /* Determine index hash size.  Note that indexing skips the
527
       first byte to allow for optimizing the Rabin's polynomial
528
       initialization in create_delta(). */
529
    total_num_entries = num_entries + old_index->num_entries;
3735.33.8 by John Arbash Meinel
Reverted back to the same hash width, and bumped EXTRA_NULLS to 3.
530
    hsize = total_num_entries / 4;
3735.33.4 by John Arbash Meinel
The new layout is working.
531
    for (i = 4; (1u << i) < hsize && i < 31; i++);
532
    hsize = 1 << i;
533
    if (hsize < old_index->hash_mask) {
534
        /* For some reason, there was a code path that would actually *shrink*
535
         * the hash size. This screws with some later code, and in general, I
536
         * think it better to make the hash bigger, rather than smaller. So
537
         * we'll just force the size here.
538
         * Possibly done by create_delta_index running into a
539
         * limit_hash_buckets call, that ended up transitioning across a
540
         * power-of-2. The cause isn't 100% clear, though.
541
         */
542
        hsize = old_index->hash_mask + 1;
543
    }
544
    hmask = hsize - 1;
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
545
    // fprintf(stderr, "resizing index to insert %d entries into array"
546
    //                 " with %d entries: %x => %x\n",
547
    //         num_entries, old_index->num_entries, old_index->hash_mask, hmask);
3735.33.4 by John Arbash Meinel
The new layout is working.
548
549
    memsize = sizeof(*index)
550
        + sizeof(*packed_hash) * (hsize+1)
551
        + sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);
552
    mem = malloc(memsize);
553
    if (!mem) {
554
        return NULL;
555
    }
556
    index = mem;
557
    index->memsize = memsize;
558
    index->hash_mask = hmask;
559
    index->num_entries = total_num_entries;
560
    index->last_src = old_index->last_src;
561
562
    mem = index->hash;
563
    packed_hash = mem;
564
    mem = packed_hash + (hsize+1);
565
    packed_entry = mem;
566
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
567
    mini_hash = _put_entries_into_hash(entries, num_entries, hsize);
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
568
    if (mini_hash == NULL) {
569
        free(index);
570
        return NULL;
571
    }
3735.33.4 by John Arbash Meinel
The new layout is working.
572
    last_entry = entries + num_entries;
573
    for (i = 0; i < hsize; i++) {
574
        /*
575
         * Coalesce all entries belonging in one hash bucket
576
         * into consecutive array entries.
577
         * The entries in old_index all come before 'entries'.
578
         */
579
        packed_hash[i] = packed_entry;
580
        /* Copy any of the old entries across */
581
        /* Would we rather use memcpy? */
582
        if (hmask == old_index->hash_mask) {
583
            for (entry = old_index->hash[i];
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
584
                 entry < old_index->hash[i+1] && entry->ptr != NULL;
3735.33.4 by John Arbash Meinel
The new layout is working.
585
                 ++entry) {
586
                assert((entry->val & hmask) == i);
587
                *packed_entry++ = *entry;
588
            }
589
        } else {
590
            /* If we resized the index from this action, all of the old values
591
             * will be found in the previous location, but they will end up
592
             * spread across the new locations.
593
             */
594
            j = i & old_index->hash_mask;
595
            for (entry = old_index->hash[j];
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
596
                 entry < old_index->hash[j+1] && entry->ptr != NULL;
3735.33.4 by John Arbash Meinel
The new layout is working.
597
                 ++entry) {
598
                assert((entry->val & old_index->hash_mask) == j);
599
                if ((entry->val & hmask) == i) {
600
                    /* Any entries not picked up here will be picked up on the
601
                     * next pass.
602
                     */
603
                    *packed_entry++ = *entry;
604
                }
605
            }
606
        }
607
        /* Now see if we need to insert any of the new entries.
608
         * Note that loop ends up O(hsize*num_entries), so we expect that
609
         * num_entries is always small.
610
         * We also help a little bit by collapsing the entry range when the
611
         * endpoints are inserted. However, an alternative would be to build a
612
         * quick hash lookup for just the new entries.
613
         * Testing shows that this list can easily get up to about 100
614
         * entries, the tradeoff is a malloc, 1 pass over the entries, copying
615
         * them into a sorted buffer, and a free() when done,
616
         */
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
617
        for (unpacked_entry = mini_hash[i];
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
618
             unpacked_entry;
619
             unpacked_entry = unpacked_entry->next) {
3735.33.10 by John Arbash Meinel
Fix a bug when there is more than one entry (increment out_entry).
620
            assert((unpacked_entry->p_entry->val & hmask) == i);
621
            *packed_entry++ = *(unpacked_entry->p_entry);
3735.33.4 by John Arbash Meinel
The new layout is working.
622
        }
623
        /* Now insert some extra nulls */
624
        for (j = 0; j < EXTRA_NULLS; ++j) {
625
            *packed_entry++ = null_entry;
626
        }
627
    }
3735.33.9 by John Arbash Meinel
When expanding an index put the entries into a hash.
628
    free(mini_hash);
3735.33.4 by John Arbash Meinel
The new layout is working.
629
630
    /* Sentinel value to indicate the length of the last hash bucket */
631
    packed_hash[hsize] = packed_entry;
632
633
    if ((packed_entry - (struct index_entry *)mem)
634
        != (total_num_entries + hsize*EXTRA_NULLS)) {
635
        fprintf(stderr, "We expected %d entries, but created %d\n",
636
                total_num_entries + hsize*EXTRA_NULLS,
637
                (int)(packed_entry - (struct index_entry*)mem));
638
        fflush(stderr);
639
    }
640
    assert((packed_entry - (struct index_entry *)mem)
641
           == (total_num_entries + hsize * EXTRA_NULLS));
642
    index->last_entry = (packed_entry - 1);
643
    return index;
644
}
645
646
647
void
648
get_text(char buff[128], const unsigned char *ptr)
649
{
650
    unsigned int i;
651
    const unsigned char *start;
652
    unsigned char cmd;
653
    start = (ptr-RABIN_WINDOW-1);
654
    cmd = *(start);
655
    if (cmd < 0x80) {// This is likely to be an insert instruction
656
        if (cmd < RABIN_WINDOW) {
657
            cmd = RABIN_WINDOW;
658
        }
659
    } else {
660
        /* This was either a copy [should never be] or it
661
         * was a longer insert so the insert start happened at 16 more
662
         * bytes back.
663
         */
664
        cmd = RABIN_WINDOW + 1;
665
    }
666
    if (cmd > 60) {
667
        cmd = 60; /* Be friendly to 80char terms */
668
    }
669
    /* Copy the 1 byte command, and 4 bytes after the insert */
670
    cmd += 5;
671
    memcpy(buff, start, cmd);
672
    buff[cmd] = 0;
673
    for (i = 0; i < cmd; ++i) {
674
        if (buff[i] == '\n') {
675
            buff[i] = 'N';
676
        } else if (buff[i] == '\t') {
677
            buff[i] = 'T';
678
        }
679
    }
680
}
681
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
682
struct delta_index *
683
create_delta_index_from_delta(const struct source_info *src,
3735.33.4 by John Arbash Meinel
The new layout is working.
684
                              struct delta_index *old_index)
0.23.45 by John Arbash Meinel
Add a function that updates the index for delta bytes.
685
{
3735.33.4 by John Arbash Meinel
The new layout is working.
686
    unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;
687
    unsigned int hash_offset;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
688
    const unsigned char *data, *buffer, *top;
689
    unsigned char cmd;
3735.33.4 by John Arbash Meinel
The new layout is working.
690
    struct delta_index *new_index;
691
    struct index_entry *entry, *entries, *old_entry;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
692
693
    if (!src->buf || !src->size)
694
        return NULL;
695
    buffer = src->buf;
696
    top = buffer + src->size;
697
698
    /* Determine index hash size.  Note that indexing skips the
699
       first byte to allow for optimizing the Rabin's polynomial
700
       initialization in create_delta().
701
       This computes the maximum number of entries that could be held. The
702
       actual number will be recomputed during processing.
703
       */
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
704
3735.33.4 by John Arbash Meinel
The new layout is working.
705
    max_num_entries = (src->size - 1)  / RABIN_WINDOW;
706
707
    /* allocate an array to hold whatever entries we find */
708
    entries = malloc(sizeof(*entry) * max_num_entries);
709
    if (!entries) /* malloc failure */
710
        return NULL;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
711
712
    /* then populate the index for the new data */
713
    prev_val = ~0;
714
    data = buffer;
715
    /* target size */
716
    get_delta_hdr_size(&data, top);
3735.33.4 by John Arbash Meinel
The new layout is working.
717
    entry = entries; /* start at the first slot */
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
718
    num_entries = 0; /* calculate the real number of entries */
719
    while (data < top) {
720
        cmd = *data++;
721
        if (cmd & 0x80) {
722
            /* Copy instruction, skip it */
723
            if (cmd & 0x01) data++;
724
            if (cmd & 0x02) data++;
725
            if (cmd & 0x04) data++;
726
            if (cmd & 0x08) data++;
727
            if (cmd & 0x10) data++;
728
            if (cmd & 0x20) data++;
729
            if (cmd & 0x40) data++;
730
        } else if (cmd) {
731
            /* Insert instruction, we want to index these bytes */
732
            if (data + cmd > top) {
733
                /* Invalid insert, not enough bytes in the delta */
734
                break;
735
            }
3735.33.5 by John Arbash Meinel
Restoring the debugging for now.
736
            /* The create_delta code requires a match at least 4 characters
737
             * (including only the last char of the RABIN_WINDOW) before it
738
             * will consider it something worth copying rather than inserting.
739
             * So we don't want to index anything that we know won't ever be a
740
             * match.
741
             */
742
            for (; cmd > RABIN_WINDOW + 3; cmd -= RABIN_WINDOW,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
743
                                       data += RABIN_WINDOW) {
744
                unsigned int val = 0;
745
                for (i = 1; i <= RABIN_WINDOW; i++)
746
                    val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
747
                if (val != prev_val) {
748
                    /* Only keep the first of consecutive data */
749
                    prev_val = val;
3735.33.4 by John Arbash Meinel
The new layout is working.
750
                    num_entries++;
751
                    entry->ptr = data + RABIN_WINDOW;
752
                    entry->val = val;
753
                    entry->src = src;
3735.33.2 by John Arbash Meinel
Revert some of the previous work.
754
                    entry++;
3735.33.4 by John Arbash Meinel
The new layout is working.
755
                    if (num_entries > max_num_entries) {
756
                        /* We ran out of entry room, something is really wrong
757
                         */
758
                        break;
759
                    }
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
760
                }
761
            }
762
            /* Move the data pointer by whatever remainder is left */
763
            data += cmd;
764
        } else {
765
            /*
766
             * cmd == 0 is reserved for future encoding
767
             * extensions. In the mean time we must fail when
768
             * encountering them (might be data corruption).
769
             */
770
            break;
771
        }
772
    }
773
    if (data != top) {
774
        /* Something was wrong with this delta */
3735.33.4 by John Arbash Meinel
The new layout is working.
775
        free(entries);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
776
        return NULL;
777
    }
778
    if (num_entries == 0) {
779
        /** Nothing to index **/
3735.33.4 by John Arbash Meinel
The new layout is working.
780
        free(entries);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
781
        return NULL;
782
    }
3735.33.4 by John Arbash Meinel
The new layout is working.
783
    assert(old_index != NULL);
784
    old_index->last_src = src;
785
    /* See if we can fill in these values into the holes in the array */
786
    entry = entries;
787
    num_inserted = 0;
788
    for (; num_entries > 0; --num_entries, ++entry) {
789
        hash_offset = (entry->val & old_index->hash_mask);
790
        /* The basic structure is a hash => packed_entries that fit in that
791
         * hash bucket. Things are structured such that the hash-pointers are
792
         * strictly ordered. So we start by pointing to the next pointer, and
793
         * walk back until we stop getting NULL targets, and then go back
794
         * forward. If there are no NULL targets, then we know because
795
         * entry->ptr will not be NULL.
796
         */
797
        old_entry = old_index->hash[hash_offset + 1];
798
        old_entry--;
799
        while (old_entry->ptr == NULL
800
               && old_entry >= old_index->hash[hash_offset]) {
801
            old_entry--;
802
        }
803
        old_entry++;
804
        if (old_entry->ptr != NULL
805
            || old_entry >= old_index->hash[hash_offset + 1]) {
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
806
            /* There is no room for this entry, we have to resize */
807
            // char buff[128];
808
            // get_text(buff, entry->ptr);
809
            // fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",
810
            //         hash_offset, entry->val, buff);
811
            // for (old_entry = old_index->hash[hash_offset];
812
            //      old_entry < old_index->hash[hash_offset+1];
813
            //      ++old_entry) {
814
            //     get_text(buff, old_entry->ptr);
815
            //     fprintf(stderr, "  [%2d] %8x %8x: '%s'\n",
816
            //             (int)(old_entry - old_index->hash[hash_offset]),
817
            //             old_entry->val, old_entry->ptr, buff);
818
            // }
3735.33.4 by John Arbash Meinel
The new layout is working.
819
            break;
820
        }
821
        num_inserted++;
822
        *old_entry = *entry;
823
        /* For entries which we *do* manage to insert into old_index, we don't
824
         * want them double copied into the final output.
825
         */
826
        old_index->num_entries++;
827
    }
828
    if (num_entries > 0) {
829
        /* We couldn't fit the new entries into the old index, so allocate a
830
         * new one, and fill it with stuff.
831
         */
3735.33.6 by John Arbash Meinel
Increasing EXTRA_NULLS to 2 from 1 ups the hit rate
832
        // fprintf(stderr, "inserted %d before resize\n", num_inserted);
3735.33.4 by John Arbash Meinel
The new layout is working.
833
        new_index = create_index_from_old_and_new_entries(old_index,
834
            entry, num_entries);
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
835
    } else {
3735.33.4 by John Arbash Meinel
The new layout is working.
836
        new_index = NULL;
3735.33.15 by John Arbash Meinel
Remove the noisy debugging code. (down to 23.1s)
837
        // fprintf(stderr, "inserted %d without resizing\n", num_inserted);
3735.33.4 by John Arbash Meinel
The new layout is working.
838
    }
839
    free(entries);
840
    return new_index;
0.23.45 by John Arbash Meinel
Add a function that updates the index for delta bytes.
841
}
842
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
843
void free_delta_index(struct delta_index *index)
844
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
845
    free(index);
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
846
}
847
848
unsigned long sizeof_delta_index(struct delta_index *index)
849
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
850
    if (index)
851
        return index->memsize;
852
    else
853
        return 0;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
854
}
855
856
/*
857
 * The maximum size for any opcode sequence, including the initial header
858
 * plus Rabin window plus biggest copy.
859
 */
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
860
#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
861
862
void *
0.23.44 by John Arbash Meinel
Remove the multi-index handling now that we have index combining instead.
863
create_delta(const struct delta_index *index,
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
864
             const void *trg_buf, unsigned long trg_size,
865
             unsigned long *delta_size, unsigned long max_size)
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
866
{
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
867
    unsigned int i, outpos, outsize, moff, msize, val;
868
    const struct source_info *msource;
869
    int inscnt;
870
    const unsigned char *ref_data, *ref_top, *data, *top;
871
    unsigned char *out;
872
    unsigned long source_size;
873
874
    if (!trg_buf || !trg_size)
875
        return NULL;
876
    if (index == NULL)
877
        return NULL;
878
879
    outpos = 0;
880
    outsize = 8192;
881
    if (max_size && outsize >= max_size)
882
        outsize = max_size + MAX_OP_SIZE + 1;
883
    out = malloc(outsize);
884
    if (!out)
885
        return NULL;
886
887
    source_size = index->last_src->size + index->last_src->agg_offset;
888
889
    /* store target buffer size */
890
    i = trg_size;
891
    while (i >= 0x80) {
892
        out[outpos++] = i | 0x80;
893
        i >>= 7;
894
    }
895
    out[outpos++] = i;
896
897
    data = trg_buf;
898
    top = (const unsigned char *) trg_buf + trg_size;
899
900
    /* Start the matching by filling out with a simple 'insert' instruction, of
901
     * the first RABIN_WINDOW bytes of the input.
902
     */
903
    outpos++; /* leave a byte for the insert command */
904
    val = 0;
905
    for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
906
        out[outpos++] = *data;
907
        val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
908
    }
909
    /* we are now setup with an insert of 'i' bytes and val contains the RABIN
910
     * hash for those bytes, and data points to the RABIN_WINDOW+1 byte of
911
     * input.
912
     */
913
    inscnt = i;
914
915
    moff = 0;
916
    msize = 0;
917
    msource = NULL;
918
    while (data < top) {
919
        if (msize < 4096) {
920
            /* we don't have a 'worthy enough' match yet, so let's look for
921
             * one.
922
             */
923
            struct index_entry *entry;
924
            /* Shift the window by one byte. */
925
            val ^= U[data[-RABIN_WINDOW]];
926
            val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
927
            i = val & index->hash_mask;
928
            /* TODO: When using multiple indexes like this, the hash tables
929
             *       mapping val => index_entry become less efficient.
930
             *       You end up getting a lot more collisions in the hash,
931
             *       which doesn't actually lead to a entry->val match.
932
             */
3735.33.1 by John Arbash Meinel
(broken, in progress), change the data structures to allow for inserting small deltas.
933
            for (entry = index->hash[i];
934
                 entry < index->hash[i+1] && entry->src != NULL;
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
935
                 entry++) {
936
                const unsigned char *ref;
937
                const unsigned char *src;
938
                unsigned int ref_size;
939
                if (entry->val != val)
940
                    continue;
941
                ref = entry->ptr;
942
                src = data;
943
                ref_data = entry->src->buf;
944
                ref_top = ref_data + entry->src->size;
945
                ref_size = ref_top - ref;
946
                /* ref_size is the longest possible match that we could make
947
                 * here. If ref_size <= msize, then we know that we cannot
948
                 * match more bytes with this location that we have already
949
                 * matched.
950
                 */
951
                if (ref_size > top - src)
952
                    ref_size = top - src;
953
                if (ref_size <= msize)
954
                    break;
955
                /* See how many bytes actually match at this location. */
956
                while (ref_size-- && *src++ == *ref)
957
                    ref++;
958
                if (msize < ref - entry->ptr) {
959
                    /* this is our best match so far */
960
                    msize = ref - entry->ptr;
961
                    msource = entry->src;
962
                    moff = entry->ptr - ref_data;
963
                    if (msize >= 4096) /* good enough */
964
                        break;
965
                }
966
            }
967
        }
968
969
        if (msize < 4) {
970
            /* The best match right now is less than 4 bytes long. So just add
971
             * the current byte to the insert instruction. Increment the insert
972
             * counter, and copy the byte of data into the output buffer.
973
             */
974
            if (!inscnt)
975
                outpos++;
976
            out[outpos++] = *data++;
977
            inscnt++;
978
            if (inscnt == 0x7f) {
979
                /* We have a max length insert instruction, finalize it in the
980
                 * output.
981
                 */
982
                out[outpos - inscnt - 1] = inscnt;
983
                inscnt = 0;
984
            }
985
            msize = 0;
986
        } else {
987
            unsigned int left;
988
            unsigned char *op;
989
990
            if (inscnt) {
991
                ref_data = msource->buf;
992
                while (moff && ref_data[moff-1] == data[-1]) {
993
                    /* we can match one byte back */
994
                    msize++;
995
                    moff--;
996
                    data--;
997
                    outpos--;
998
                    if (--inscnt)
999
                        continue;
1000
                    outpos--;  /* remove count slot */
1001
                    inscnt--;  /* make it -1 */
1002
                    break;
1003
                }
1004
                out[outpos - inscnt - 1] = inscnt;
1005
                inscnt = 0;
1006
            }
1007
1008
            /* A copy op is currently limited to 64KB (pack v2) */
1009
            left = (msize < 0x10000) ? 0 : (msize - 0x10000);
1010
            msize -= left;
1011
1012
            op = out + outpos++;
1013
            i = 0x80;
1014
1015
            /* moff is the offset in the local structure, for encoding, we need
1016
             * to push it into the global offset
1017
             */
1018
            assert(moff < msource->size);
1019
            moff += msource->agg_offset;
1020
            assert(moff + msize <= source_size);
1021
            if (moff & 0x000000ff)
1022
                out[outpos++] = moff >> 0,  i |= 0x01;
1023
            if (moff & 0x0000ff00)
1024
                out[outpos++] = moff >> 8,  i |= 0x02;
1025
            if (moff & 0x00ff0000)
1026
                out[outpos++] = moff >> 16, i |= 0x04;
1027
            if (moff & 0xff000000)
1028
                out[outpos++] = moff >> 24, i |= 0x08;
1029
            /* Put it back into local coordinates, in case we have multiple
1030
             * copies in a row.
1031
             */
1032
            moff -= msource->agg_offset;
1033
1034
            if (msize & 0x00ff)
1035
                out[outpos++] = msize >> 0, i |= 0x10;
1036
            if (msize & 0xff00)
1037
                out[outpos++] = msize >> 8, i |= 0x20;
1038
1039
            *op = i;
1040
1041
            data += msize;
1042
            moff += msize;
1043
            msize = left;
1044
1045
            if (msize < 4096) {
1046
                int j;
1047
                val = 0;
1048
                for (j = -RABIN_WINDOW; j < 0; j++)
1049
                    val = ((val << 8) | data[j])
1050
                          ^ T[val >> RABIN_SHIFT];
1051
            }
1052
        }
1053
1054
        if (outpos >= outsize - MAX_OP_SIZE) {
1055
            void *tmp = out;
1056
            outsize = outsize * 3 / 2;
1057
            if (max_size && outsize >= max_size)
1058
                outsize = max_size + MAX_OP_SIZE + 1;
1059
            if (max_size && outpos > max_size)
1060
                break;
1061
            out = realloc(out, outsize);
1062
            if (!out) {
1063
                free(tmp);
1064
                return NULL;
1065
            }
1066
        }
1067
    }
1068
1069
    if (inscnt)
1070
        out[outpos - inscnt - 1] = inscnt;
1071
1072
    if (max_size && outpos > max_size) {
1073
        free(out);
1074
        return NULL;
1075
    }
1076
1077
    *delta_size = outpos;
1078
    return out;
0.23.2 by Nicolas Pitre
Add the diff-delta.c and patch-delta.c files.
1079
}
0.23.24 by John Arbash Meinel
Change the code so that we can pass in multiple sources to match against.
1080
0.23.57 by John Arbash Meinel
Change the formatting, replace \t with spaces to be consistent with bzr coding.
1081
/* vim: et ts=4 sw=4 sts=4
0.23.36 by John Arbash Meinel
Track down a memory leak in the refactored diff-delta.c code.
1082
 */