~bzr-pqm/bzr/bzr.dev : revision 4241.6.6

1

/*

2

* diff-delta.c: generate a delta between two buffers

3

*

4

* This code was greatly inspired by parts of LibXDiff from Davide Libenzi

5

* http://www.xmailserver.org/xdiff-lib.html

6

*

7

8

9

*

10

* This program is free software; you can redistribute it and/or modify

11

* it under the terms of the GNU General Public License as published by

12

* the Free Software Foundation; either version 2 of the License, or

13

* (at your option) any later version.

14

*

15

* NB: The version in GIT is 'version 2 of the Licence only', however Nicolas

16

* has granted permission for use under 'version 2 or later' in private email

17

* to Robert Collins and Karl Fogel on the 6th April 2009.

18

*/

19

20

#include <stdio.h>

21

22

#include "delta.h"

23

#include <stdlib.h>

24

#include <string.h>

25

#include <assert.h>

26

27

/* maximum hash entry list for the same hash bucket */

28

#define HASH_LIMIT 64

29

30

#define RABIN_SHIFT 23

31

#define RABIN_WINDOW 16

32

33

/* The hash map is sized to put 4 entries per bucket, this gives us ~even room

34

* for more data. Tweaking this number above 4 doesn't seem to help much,

35

* anyway.

36

*/

37

#define EXTRA_NULLS 4

38

39

static const unsigned int T[256] = {

40

0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,

41

0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,

42

0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,

43

0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,

44

0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,

45

0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,

46

0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,

47

0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,

48

0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,

49

0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,

50

0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,

51

0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,

52

0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,

53

0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,

54

0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,

55

0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,

56

0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,

57

0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,

58

0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,

59

0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,

60

0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,

61

0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,

62

0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,

63

0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,

64

0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,

65

0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,

66

0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,

67

0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,

68

0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,

69

0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,

70

0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,

71

0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,

72

0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,

73

0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,

74

0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,

75

0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,

76

0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,

77

0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,

78

0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,

79

0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,

80

0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,

81

0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,

82

0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811

83

};

84

85

static const unsigned int U[256] = {

86

0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,

87

0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,

88

0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,

89

0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,

90

0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,

91

0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,

92

0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,

93

0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,

94

0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,

95

0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,

96

0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,

97

0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,

98

0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,

99

0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,

100

0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,

101

0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,

102

0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,

103

0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,

104

0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,

105

0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,

106

0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,

107

0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,

108

0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,

109

0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,

110

0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,

111

0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,

112

0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,

113

0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,

114

0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,

115

0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,

116

0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,

117

0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,

118

0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,

119

0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,

120

0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,

121

0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,

122

0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,

123

0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,

124

0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,

125

0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,

126

0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,

127

0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,

128

0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a

129

};

130

131

struct index_entry {

132

const unsigned char *ptr;

133

const struct source_info *src;

134

unsigned int val;

135

};

136

137

struct index_entry_linked_list {

138

struct index_entry *p_entry;

139

struct index_entry_linked_list *next;

140

};

141

142

struct unpacked_index_entry {

143

struct index_entry entry;

144

struct unpacked_index_entry *next;

145

};

146

147

struct delta_index {

148

unsigned long memsize; /* Total bytes pointed to by this index */

149

const struct source_info *last_src; /* Information about the referenced source */

150

unsigned int hash_mask; /* val & hash_mask gives the hash index for a given

151

entry */

152

unsigned int num_entries; /* The total number of entries in this index */

153

struct index_entry *last_entry; /* Pointer to the last valid entry */

154

struct index_entry *hash[];

155

};

156

157

static unsigned int

158

limit_hash_buckets(struct unpacked_index_entry **hash,

159

unsigned int *hash_count, unsigned int hsize,

160

unsigned int entries)

161

{

162

struct unpacked_index_entry *entry;

163

unsigned int i;

164

/*

165

* Determine a limit on the number of entries in the same hash

166

* bucket. This guards us against pathological data sets causing

167

* really bad hash distribution with most entries in the same hash

168

* bucket that would bring us to O(m*n) computing costs (m and n

169

* corresponding to reference and target buffer sizes).

170

*

171

* Make sure none of the hash buckets has more entries than

172

* we're willing to test. Otherwise we cull the entry list

173

* uniformly to still preserve a good repartition across

174

* the reference buffer.

175

*/

176

for (i = 0; i < hsize; i++) {

177

int acc;

178

179

if (hash_count[i] <= HASH_LIMIT)

180

continue;

181

182

/* We leave exactly HASH_LIMIT entries in the bucket */

183

entries -= hash_count[i] - HASH_LIMIT;

184

185

entry = hash[i];

186

acc = 0;

187

188

/*

189

* Assume that this loop is gone through exactly

190

* HASH_LIMIT times and is entered and left with

191

* acc==0. So the first statement in the loop

192

* contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT

193

* to the accumulator, and the inner loop consequently

194

* is run (hash_count[i]-HASH_LIMIT) times, removing

195

* one element from the list each time. Since acc

196

* balances out to 0 at the final run, the inner loop

197

* body can't be left with entry==NULL. So we indeed

198

* encounter entry==NULL in the outer loop only.

199

*/

200

do {

201

acc += hash_count[i] - HASH_LIMIT;

202

if (acc > 0) {

203

struct unpacked_index_entry *keep = entry;

204

do {

205

entry = entry->next;

206

acc -= HASH_LIMIT;

207

} while (acc > 0);

208

keep->next = entry->next;

209

}

210

entry = entry->next;

211

} while (entry);

212

}

213

return entries;

214

}

215

216

static struct delta_index *

217

pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,

218

unsigned int num_entries, struct delta_index *old_index)

219

{

220

unsigned int i, j, hmask, memsize, fit_in_old, copied_count;

221

struct unpacked_index_entry *entry;

222

struct delta_index *index;

223

struct index_entry *packed_entry, **packed_hash, *old_entry, *copy_from;

224

struct index_entry null_entry = {0};

225

void *mem;

226

227

hmask = hsize - 1;

228

229

// if (old_index) {

230

// fprintf(stderr, "Packing %d entries into %d for total of %d entries"

231

// " %x => %x\n",

232

// num_entries - old_index->num_entries,

233

// old_index->num_entries, num_entries,

234

// old_index->hash_mask, hmask);

235

// } else {

236

// fprintf(stderr, "Packing %d entries into a new index\n",

237

// num_entries);

238

// }

239

/* First, see if we can squeeze the new items into the existing structure.

240

*/

241

fit_in_old = 0;

242

copied_count = 0;

243

if (old_index && old_index->hash_mask == hmask) {

244

fit_in_old = 1;

245

for (i = 0; i < hsize; ++i) {

246

packed_entry = NULL;

247

for (entry = hash[i]; entry; entry = entry->next) {

248

if (packed_entry == NULL) {

249

/* Find the last open spot */

250

packed_entry = old_index->hash[i + 1];

251

--packed_entry;

252

while (packed_entry >= old_index->hash[i]

253

&& packed_entry->ptr == NULL) {

254

--packed_entry;

255

}

256

++packed_entry;

257

}

258

if (packed_entry >= old_index->hash[i+1]

259

|| packed_entry->ptr != NULL) {

260

/* There are no free spots here :( */

261

fit_in_old = 0;

262

break;

263

}

264

/* We found an empty spot to put this entry

265

* Copy it over, and remove it from the linked list, just in

266

* case we end up running out of room later.

267

*/

268

*packed_entry++ = entry->entry;

269

assert(entry == hash[i]);

270

hash[i] = entry->next;

271

copied_count += 1;

272

old_index->num_entries++;

273

}

274

if (!fit_in_old) {

275

break;

276

}

277

}

278

}

279

if (old_index) {

280

if (fit_in_old) {

281

// fprintf(stderr, "Fit all %d entries into old index\n",

282

// copied_count);

283

/* No need to allocate a new buffer */

284

return NULL;

285

} else {

286

// fprintf(stderr, "Fit only %d entries into old index,"

287

// " reallocating\n", copied_count);

288

}

289

}

290

/*

291

* Now create the packed index in array form

292

* rather than linked lists.

293

* Leave a 2-entry gap for inserting more entries between the groups

294

*/

295

memsize = sizeof(*index)

296

+ sizeof(*packed_hash) * (hsize+1)

297

+ sizeof(*packed_entry) * (num_entries + hsize * EXTRA_NULLS);

298

mem = malloc(memsize);

299

if (!mem) {

300

return NULL;

301

}

302

303

index = mem;

304

index->memsize = memsize;

305

index->hash_mask = hmask;

306

index->num_entries = num_entries;

307

if (old_index) {

308

if (hmask < old_index->hash_mask) {

309

fprintf(stderr, "hash mask was shrunk %x => %x\n",

310

old_index->hash_mask, hmask);

311

}

312

assert(hmask >= old_index->hash_mask);

313

}

314

315

mem = index->hash;

316

packed_hash = mem;

317

mem = packed_hash + (hsize+1);

318

packed_entry = mem;

319

320

for (i = 0; i < hsize; i++) {

321

/*

322

* Coalesce all entries belonging to one linked list

323

* into consecutive array entries.

324

*/

325

packed_hash[i] = packed_entry;

326

/* Old comes earlier as a source, so it always comes first in a given

327

* hash bucket.

328

*/

329

if (old_index) {

330

/* Could we optimize this to use memcpy when hmask ==

331

* old_index->hash_mask? Would it make any real difference?

332

*/

333

j = i & old_index->hash_mask;

334

copy_from = old_index->hash[j];

335

for (old_entry = old_index->hash[j];

336

old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;

337

old_entry++) {

338

if ((old_entry->val & hmask) == i) {

339

*packed_entry++ = *old_entry;

340

}

341

}

342

}

343

for (entry = hash[i]; entry; entry = entry->next) {

344

*packed_entry++ = entry->entry;

345

}

346

/* TODO: At this point packed_entry - packed_hash[i] is the number of

347

* records that we have inserted into this hash bucket.

348

* We should *really* consider doing some limiting along the

349

* lines of limit_hash_buckets() to avoid pathological behavior.

350

*/

351

/* Now add extra 'NULL' entries that we can use for future expansion. */

352

for (j = 0; j < EXTRA_NULLS; ++j ) {

353

*packed_entry++ = null_entry;

354

}

355

}

356

357

/* Sentinel value to indicate the length of the last hash bucket */

358

packed_hash[hsize] = packed_entry;

359

360

if (packed_entry - (struct index_entry *)mem

361

!= num_entries + hsize*EXTRA_NULLS) {

362

fprintf(stderr, "We expected %d entries, but created %d\n",

363

num_entries + hsize*EXTRA_NULLS,

364

(int)(packed_entry - (struct index_entry*)mem));

365

}

366

assert(packed_entry - (struct index_entry *)mem

367

== num_entries + hsize*EXTRA_NULLS);

368

index->last_entry = (packed_entry - 1);

369

return index;

370

}

371

372

373

struct delta_index *

374

create_delta_index(const struct source_info *src,

375

struct delta_index *old)

376

{

377

unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;

378

unsigned int total_num_entries;

379

const unsigned char *data, *buffer;

380

struct delta_index *index;

381

struct unpacked_index_entry *entry, **hash;

382

void *mem;

383

unsigned long memsize;

384

385

if (!src->buf || !src->size)

386

return NULL;

387

buffer = src->buf;

388

389

/* Determine index hash size. Note that indexing skips the

390

first byte to allow for optimizing the Rabin's polynomial

391

initialization in create_delta(). */

392

num_entries = (src->size - 1) / RABIN_WINDOW;

393

if (old != NULL)

394

total_num_entries = num_entries + old->num_entries;

395

else

396

total_num_entries = num_entries;

397

hsize = total_num_entries / 4;

398

for (i = 4; (1u << i) < hsize && i < 31; i++);

399

hsize = 1 << i;

400

hmask = hsize - 1;

401

if (old && old->hash_mask > hmask) {

402

hmask = old->hash_mask;

403

hsize = hmask + 1;

404

}

405

406

/* allocate lookup index */

407

memsize = sizeof(*hash) * hsize +

408

sizeof(*entry) * total_num_entries;

409

mem = malloc(memsize);

410

if (!mem)

411

return NULL;

412

hash = mem;

413

mem = hash + hsize;

414

entry = mem;

415

416

memset(hash, 0, hsize * sizeof(*hash));

417

418

/* allocate an array to count hash num_entries */

419

hash_count = calloc(hsize, sizeof(*hash_count));

420

if (!hash_count) {

421

free(hash);

422

return NULL;

423

}

424

425

/* then populate the index for the new data */

426

prev_val = ~0;

427

for (data = buffer + num_entries * RABIN_WINDOW - RABIN_WINDOW;

428

data >= buffer;

429

data -= RABIN_WINDOW) {

430

unsigned int val = 0;

431

for (i = 1; i <= RABIN_WINDOW; i++)

432

val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];

433

if (val == prev_val) {

434

/* keep the lowest of consecutive identical blocks */

435

entry[-1].entry.ptr = data + RABIN_WINDOW;

436

--num_entries;

437

--total_num_entries;

438

} else {

439

prev_val = val;

440

i = val & hmask;

441

entry->entry.ptr = data + RABIN_WINDOW;

442

entry->entry.val = val;

443

entry->entry.src = src;

444

entry->next = hash[i];

445

hash[i] = entry++;

446

hash_count[i]++;

447

}

448

}

449

/* TODO: It would be nice to limit_hash_buckets at a better time. */

450

total_num_entries = limit_hash_buckets(hash, hash_count, hsize,

451

total_num_entries);

452

free(hash_count);

453

if (old) {

454

old->last_src = src;

455

}

456

index = pack_delta_index(hash, hsize, total_num_entries, old);

457

free(hash);

458

if (!index) {

459

return NULL;

460

}

461

index->last_src = src;

462

return index;

463

}

464

465

/* Take some entries, and put them into a custom hash.

466

* @param entries A list of entries, sorted by position in file

467

* @param num_entries Length of entries

468

* @param out_hsize The maximum size of the hash, the final size will be

469

* returned here

470

*/

471

struct index_entry_linked_list **

472

_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,

473

unsigned int hsize)

474

{

475

unsigned int hash_offset, hmask, memsize;

476

struct index_entry *entry, *last_entry;

477

struct index_entry_linked_list *out_entry, **hash;

478

void *mem;

479

480

hmask = hsize - 1;

481

482

memsize = sizeof(*hash) * hsize +

483

sizeof(*out_entry) * num_entries;

484

mem = malloc(memsize);

485

if (!mem)

486

return NULL;

487

hash = mem;

488

mem = hash + hsize;

489

out_entry = mem;

490

491

memset(hash, 0, sizeof(*hash)*(hsize+1));

492

493

/* We know that entries are in the order we want in the output, but they

494

* aren't "grouped" by hash bucket yet.

495

*/

496

last_entry = entries + num_entries;

497

for (entry = entries + num_entries - 1; entry >= entries; --entry) {

498

hash_offset = entry->val & hmask;

499

out_entry->p_entry = entry;

500

out_entry->next = hash[hash_offset];

501

/* TODO: Remove entries that have identical vals, or at least filter

502

* the map a little bit.

503

* if (hash[i] != NULL) {

504

* }

505

*/

506

hash[hash_offset] = out_entry;

507

++out_entry;

508

}

509

return hash;

510

}

511

512

513

struct delta_index *

514

create_index_from_old_and_new_entries(const struct delta_index *old_index,

515

struct index_entry *entries,

516

unsigned int num_entries)

517

{

518

unsigned int i, j, hsize, hmask, total_num_entries;

519

struct delta_index *index;

520

struct index_entry *entry, *packed_entry, **packed_hash;

521

struct index_entry *last_entry, null_entry = {0};

522

void *mem;

523

unsigned long memsize;

524

struct index_entry_linked_list *unpacked_entry, **mini_hash;

525

526

/* Determine index hash size. Note that indexing skips the

527

first byte to allow for optimizing the Rabin's polynomial

528

initialization in create_delta(). */

529

total_num_entries = num_entries + old_index->num_entries;

530

hsize = total_num_entries / 4;

531

for (i = 4; (1u << i) < hsize && i < 31; i++);

532

hsize = 1 << i;

533

if (hsize < old_index->hash_mask) {

534

/* For some reason, there was a code path that would actually *shrink*

535

* the hash size. This screws with some later code, and in general, I

536

* think it better to make the hash bigger, rather than smaller. So

537

* we'll just force the size here.

538

* Possibly done by create_delta_index running into a

539

* limit_hash_buckets call, that ended up transitioning across a

540

* power-of-2. The cause isn't 100% clear, though.

541

*/

542

hsize = old_index->hash_mask + 1;

543

}

544

hmask = hsize - 1;

545

// fprintf(stderr, "resizing index to insert %d entries into array"

546

// " with %d entries: %x => %x\n",

547

// num_entries, old_index->num_entries, old_index->hash_mask, hmask);

548

549

memsize = sizeof(*index)

550

+ sizeof(*packed_hash) * (hsize+1)

551

+ sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);

552

mem = malloc(memsize);

553

if (!mem) {

554

return NULL;

555

}

556

index = mem;

557

index->memsize = memsize;

558

index->hash_mask = hmask;

559

index->num_entries = total_num_entries;

560

index->last_src = old_index->last_src;

561

562

mem = index->hash;

563

packed_hash = mem;

564

mem = packed_hash + (hsize+1);

565

packed_entry = mem;

566

567

mini_hash = _put_entries_into_hash(entries, num_entries, hsize);

568

if (mini_hash == NULL) {

569

free(index);

570

return NULL;

571

}

572

last_entry = entries + num_entries;

573

for (i = 0; i < hsize; i++) {

574

/*

575

* Coalesce all entries belonging in one hash bucket

576

* into consecutive array entries.

577

* The entries in old_index all come before 'entries'.

578

*/

579

packed_hash[i] = packed_entry;

580

/* Copy any of the old entries across */

581

/* Would we rather use memcpy? */

582

if (hmask == old_index->hash_mask) {

583

for (entry = old_index->hash[i];

584

entry < old_index->hash[i+1] && entry->ptr != NULL;

585

++entry) {

586

assert((entry->val & hmask) == i);

587

*packed_entry++ = *entry;

588

}

589

} else {

590

/* If we resized the index from this action, all of the old values

591

* will be found in the previous location, but they will end up

592

* spread across the new locations.

593

*/

594

j = i & old_index->hash_mask;

595

for (entry = old_index->hash[j];

596

entry < old_index->hash[j+1] && entry->ptr != NULL;

597

++entry) {

598

assert((entry->val & old_index->hash_mask) == j);

599

if ((entry->val & hmask) == i) {

600

/* Any entries not picked up here will be picked up on the

601

* next pass.

602

*/

603

*packed_entry++ = *entry;

604

}

605

}

606

}

607

/* Now see if we need to insert any of the new entries.

608

* Note that loop ends up O(hsize*num_entries), so we expect that

609

* num_entries is always small.

610

* We also help a little bit by collapsing the entry range when the

611

* endpoints are inserted. However, an alternative would be to build a

612

* quick hash lookup for just the new entries.

613

* Testing shows that this list can easily get up to about 100

614

* entries, the tradeoff is a malloc, 1 pass over the entries, copying

615

* them into a sorted buffer, and a free() when done,

616

*/

617

for (unpacked_entry = mini_hash[i];

618

unpacked_entry;

619

unpacked_entry = unpacked_entry->next) {

620

assert((unpacked_entry->p_entry->val & hmask) == i);

621

*packed_entry++ = *(unpacked_entry->p_entry);

622

}

623

/* Now insert some extra nulls */

624

for (j = 0; j < EXTRA_NULLS; ++j) {

625

*packed_entry++ = null_entry;

626

}

627

}

628

free(mini_hash);

629

630

/* Sentinel value to indicate the length of the last hash bucket */

631

packed_hash[hsize] = packed_entry;

632

633

if ((packed_entry - (struct index_entry *)mem)

634

!= (total_num_entries + hsize*EXTRA_NULLS)) {

635

fprintf(stderr, "We expected %d entries, but created %d\n",

636

total_num_entries + hsize*EXTRA_NULLS,

637

(int)(packed_entry - (struct index_entry*)mem));

638

fflush(stderr);

639

}

640

assert((packed_entry - (struct index_entry *)mem)

641

== (total_num_entries + hsize * EXTRA_NULLS));

642

index->last_entry = (packed_entry - 1);

643

return index;

644

}

645

646

647

void

648

get_text(char buff[128], const unsigned char *ptr)

649

{

650

unsigned int i;

651

const unsigned char *start;

652

unsigned char cmd;

653

start = (ptr-RABIN_WINDOW-1);

654

cmd = *(start);

655

if (cmd < 0x80) {// This is likely to be an insert instruction

656

if (cmd < RABIN_WINDOW) {

657

cmd = RABIN_WINDOW;

658

}

659

} else {

660

/* This was either a copy [should never be] or it

661

* was a longer insert so the insert start happened at 16 more

662

* bytes back.

663

*/

664

cmd = RABIN_WINDOW + 1;

665

}

666

if (cmd > 60) {

667

cmd = 60; /* Be friendly to 80char terms */

668

}

669

/* Copy the 1 byte command, and 4 bytes after the insert */

670

cmd += 5;

671

memcpy(buff, start, cmd);

672

buff[cmd] = 0;

673

for (i = 0; i < cmd; ++i) {

674

if (buff[i] == '\n') {

675

buff[i] = 'N';

676

} else if (buff[i] == '\t') {

677

buff[i] = 'T';

678

}

679

}

680

}

681

682

struct delta_index *

683

create_delta_index_from_delta(const struct source_info *src,

684

struct delta_index *old_index)

685

{

686

unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;

687

unsigned int hash_offset;

688

const unsigned char *data, *buffer, *top;

689

unsigned char cmd;

690

struct delta_index *new_index;

691

struct index_entry *entry, *entries, *old_entry;

692

693

if (!src->buf || !src->size)

694

return NULL;

695

buffer = src->buf;

696

top = buffer + src->size;

697

698

/* Determine index hash size. Note that indexing skips the

699

first byte to allow for optimizing the Rabin's polynomial

700

initialization in create_delta().

701

This computes the maximum number of entries that could be held. The

702

actual number will be recomputed during processing.

703

*/

704

705

max_num_entries = (src->size - 1) / RABIN_WINDOW;

706

707

/* allocate an array to hold whatever entries we find */

708

entries = malloc(sizeof(*entry) * max_num_entries);

709

if (!entries) /* malloc failure */

710

return NULL;

711

712

/* then populate the index for the new data */

713

prev_val = ~0;

714

data = buffer;

715

/* target size */

716

get_delta_hdr_size(&data, top);

717

entry = entries; /* start at the first slot */

718

num_entries = 0; /* calculate the real number of entries */

719

while (data < top) {

720

cmd = *data++;

721

if (cmd & 0x80) {

722

/* Copy instruction, skip it */

723

if (cmd & 0x01) data++;

724

if (cmd & 0x02) data++;

725

if (cmd & 0x04) data++;

726

if (cmd & 0x08) data++;

727

if (cmd & 0x10) data++;

728

if (cmd & 0x20) data++;

729

if (cmd & 0x40) data++;

730

} else if (cmd) {

731

/* Insert instruction, we want to index these bytes */

732

if (data + cmd > top) {

733

/* Invalid insert, not enough bytes in the delta */

734

break;

735

}

736

/* The create_delta code requires a match at least 4 characters

737

* (including only the last char of the RABIN_WINDOW) before it

738

* will consider it something worth copying rather than inserting.

739

* So we don't want to index anything that we know won't ever be a

740

* match.

741

*/

742

for (; cmd > RABIN_WINDOW + 3; cmd -= RABIN_WINDOW,

743

data += RABIN_WINDOW) {

744

unsigned int val = 0;

745

for (i = 1; i <= RABIN_WINDOW; i++)

746

val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];

747

if (val != prev_val) {

748

/* Only keep the first of consecutive data */

749

prev_val = val;

750

num_entries++;

751

entry->ptr = data + RABIN_WINDOW;

752

entry->val = val;

753

entry->src = src;

754

entry++;

755

if (num_entries > max_num_entries) {

756

/* We ran out of entry room, something is really wrong

757

*/

758

break;

759

}

760

}

761

}

762

/* Move the data pointer by whatever remainder is left */

763

data += cmd;

764

} else {

765

/*

766

* cmd == 0 is reserved for future encoding

767

* extensions. In the mean time we must fail when

768

* encountering them (might be data corruption).

769

*/

770

break;

771

}

772

}

773

if (data != top) {

774

/* Something was wrong with this delta */

775

free(entries);

776

return NULL;

777

}

778

if (num_entries == 0) {

779

/** Nothing to index **/

780

free(entries);

781

return NULL;

782

}

783

assert(old_index != NULL);

784

old_index->last_src = src;

785

/* See if we can fill in these values into the holes in the array */

786

entry = entries;

787

num_inserted = 0;

788

for (; num_entries > 0; --num_entries, ++entry) {

789

hash_offset = (entry->val & old_index->hash_mask);

790

/* The basic structure is a hash => packed_entries that fit in that

791

* hash bucket. Things are structured such that the hash-pointers are

792

* strictly ordered. So we start by pointing to the next pointer, and

793

* walk back until we stop getting NULL targets, and then go back

794

* forward. If there are no NULL targets, then we know because

795

* entry->ptr will not be NULL.

796

*/

797

old_entry = old_index->hash[hash_offset + 1];

798

old_entry--;

799

while (old_entry->ptr == NULL

800

&& old_entry >= old_index->hash[hash_offset]) {

801

old_entry--;

802

}

803

old_entry++;

804

if (old_entry->ptr != NULL

805

|| old_entry >= old_index->hash[hash_offset + 1]) {

806

/* There is no room for this entry, we have to resize */

807

// char buff[128];

808

// get_text(buff, entry->ptr);

809

// fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",

810

// hash_offset, entry->val, buff);

811

// for (old_entry = old_index->hash[hash_offset];

812

// old_entry < old_index->hash[hash_offset+1];

813

// ++old_entry) {

814

// get_text(buff, old_entry->ptr);

815

// fprintf(stderr, " [%2d] %8x %8x: '%s'\n",

816

// (int)(old_entry - old_index->hash[hash_offset]),

817

// old_entry->val, old_entry->ptr, buff);

818

// }

819

break;

820

}

821

num_inserted++;

822

*old_entry = *entry;

823

/* For entries which we *do* manage to insert into old_index, we don't

824

* want them double copied into the final output.

825

*/

826

old_index->num_entries++;

827

}

828

if (num_entries > 0) {

829

/* We couldn't fit the new entries into the old index, so allocate a

830

* new one, and fill it with stuff.

831

*/

832

// fprintf(stderr, "inserted %d before resize\n", num_inserted);

833

new_index = create_index_from_old_and_new_entries(old_index,

834

entry, num_entries);

835

} else {

836

new_index = NULL;

837

// fprintf(stderr, "inserted %d without resizing\n", num_inserted);

838

}

839

free(entries);

840

return new_index;

841

}

842

843

void free_delta_index(struct delta_index *index)

844

{

845

free(index);

846

}

847

848

unsigned long sizeof_delta_index(struct delta_index *index)

849

{

850

if (index)

851

return index->memsize;

852

else

853

return 0;

854

}

855

856

/*

857

* The maximum size for any opcode sequence, including the initial header

858

* plus Rabin window plus biggest copy.

859

*/

860

#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)

861

862

void *

863

create_delta(const struct delta_index *index,

864

const void *trg_buf, unsigned long trg_size,

865

unsigned long *delta_size, unsigned long max_size)

866

{

867

unsigned int i, outpos, outsize, moff, msize, val;

868

const struct source_info *msource;

869

int inscnt;

870

const unsigned char *ref_data, *ref_top, *data, *top;

871

unsigned char *out;

872

unsigned long source_size;

873

874

if (!trg_buf || !trg_size)

875

return NULL;

876

if (index == NULL)

877

return NULL;

878

879

outpos = 0;

880

outsize = 8192;

881

if (max_size && outsize >= max_size)

882

outsize = max_size + MAX_OP_SIZE + 1;

883

out = malloc(outsize);

884

if (!out)

885

return NULL;

886

887

source_size = index->last_src->size + index->last_src->agg_offset;

888

889

/* store target buffer size */

890

i = trg_size;

891

while (i >= 0x80) {

892

out[outpos++] = i | 0x80;

893

i >>= 7;

894

}

895

out[outpos++] = i;

896

897

data = trg_buf;

898

top = (const unsigned char *) trg_buf + trg_size;

899

900

/* Start the matching by filling out with a simple 'insert' instruction, of

901

* the first RABIN_WINDOW bytes of the input.

902

*/

903

outpos++; /* leave a byte for the insert command */

904

val = 0;

905

for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {

906

out[outpos++] = *data;

907

val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];

908

}

909

/* we are now setup with an insert of 'i' bytes and val contains the RABIN

910

* hash for those bytes, and data points to the RABIN_WINDOW+1 byte of

911

* input.

912

*/

913

inscnt = i;

914

915

moff = 0;

916

msize = 0;

917

msource = NULL;

918

while (data < top) {

919

if (msize < 4096) {

920

/* we don't have a 'worthy enough' match yet, so let's look for

921

* one.

922

*/

923

struct index_entry *entry;

924

/* Shift the window by one byte. */

925

val ^= U[data[-RABIN_WINDOW]];

926

val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];

927

i = val & index->hash_mask;

928

/* TODO: When using multiple indexes like this, the hash tables

929

* mapping val => index_entry become less efficient.

930

* You end up getting a lot more collisions in the hash,

931

* which doesn't actually lead to a entry->val match.

932

*/

933

for (entry = index->hash[i];

934

entry < index->hash[i+1] && entry->src != NULL;

935

entry++) {

936

const unsigned char *ref;

937

const unsigned char *src;

938

unsigned int ref_size;

939

if (entry->val != val)

940

continue;

941

ref = entry->ptr;

942

src = data;

943

ref_data = entry->src->buf;

944

ref_top = ref_data + entry->src->size;

945

ref_size = ref_top - ref;

946

/* ref_size is the longest possible match that we could make

947

* here. If ref_size <= msize, then we know that we cannot

948

* match more bytes with this location that we have already

949

* matched.

950

*/

951

if (ref_size > top - src)

952

ref_size = top - src;

953

if (ref_size <= msize)

954

break;

955

/* See how many bytes actually match at this location. */

956

while (ref_size-- && *src++ == *ref)

957

ref++;

958

if (msize < ref - entry->ptr) {

959

/* this is our best match so far */

960

msize = ref - entry->ptr;

961

msource = entry->src;

962

moff = entry->ptr - ref_data;

963

if (msize >= 4096) /* good enough */

964

break;

965

}

966

}

967

}

968

969

if (msize < 4) {

970

/* The best match right now is less than 4 bytes long. So just add

971

* the current byte to the insert instruction. Increment the insert

972

* counter, and copy the byte of data into the output buffer.

973

*/

974

if (!inscnt)

975

outpos++;

976

out[outpos++] = *data++;

977

inscnt++;

978

if (inscnt == 0x7f) {

979

/* We have a max length insert instruction, finalize it in the

980

* output.

981

*/

982

out[outpos - inscnt - 1] = inscnt;

983

inscnt = 0;

984

}

985

msize = 0;

986

} else {

987

unsigned int left;

988

unsigned char *op;

989

990

if (inscnt) {

991

ref_data = msource->buf;

992

while (moff && ref_data[moff-1] == data[-1]) {

993

/* we can match one byte back */

994

msize++;

995

moff--;

996

data--;

997

outpos--;

998

if (--inscnt)

999

continue;

1000

outpos--; /* remove count slot */

1001

inscnt--; /* make it -1 */

1002

break;

1003

}

1004

out[outpos - inscnt - 1] = inscnt;

1005

inscnt = 0;

1006

}

1007

1008

/* A copy op is currently limited to 64KB (pack v2) */

1009

left = (msize < 0x10000) ? 0 : (msize - 0x10000);

1010

msize -= left;

1011

1012

op = out + outpos++;

1013

i = 0x80;

1014

1015

/* moff is the offset in the local structure, for encoding, we need

1016

* to push it into the global offset

1017

*/

1018

assert(moff < msource->size);

1019

moff += msource->agg_offset;

1020

assert(moff + msize <= source_size);

1021

if (moff & 0x000000ff)

1022

out[outpos++] = moff >> 0, i |= 0x01;

1023

if (moff & 0x0000ff00)

1024

out[outpos++] = moff >> 8, i |= 0x02;

1025

if (moff & 0x00ff0000)

1026

out[outpos++] = moff >> 16, i |= 0x04;

1027

if (moff & 0xff000000)

1028

out[outpos++] = moff >> 24, i |= 0x08;

1029

/* Put it back into local coordinates, in case we have multiple

1030

* copies in a row.

1031

*/

1032

moff -= msource->agg_offset;

1033

1034

if (msize & 0x00ff)

1035

out[outpos++] = msize >> 0, i |= 0x10;

1036

if (msize & 0xff00)

1037

out[outpos++] = msize >> 8, i |= 0x20;

1038

1039

*op = i;

1040

1041

data += msize;

1042

moff += msize;

1043

msize = left;

1044

1045

if (msize < 4096) {

1046

int j;

1047

val = 0;

1048

for (j = -RABIN_WINDOW; j < 0; j++)

1049

val = ((val << 8) | data[j])

1050

^ T[val >> RABIN_SHIFT];

1051

}

1052

}

1053

1054

if (outpos >= outsize - MAX_OP_SIZE) {

1055

void *tmp = out;

1056

outsize = outsize * 3 / 2;

1057

if (max_size && outsize >= max_size)

1058

outsize = max_size + MAX_OP_SIZE + 1;

1059

if (max_size && outpos > max_size)

1060

break;

1061

out = realloc(out, outsize);

1062

if (!out) {

1063

free(tmp);

1064

return NULL;

1065

}

1066

}

1067

}

1068

1069

if (inscnt)

1070

out[outpos - inscnt - 1] = inscnt;

1071

1072

if (max_size && outpos > max_size) {

1073

free(out);

1074

return NULL;

1075

}

1076

1077

*delta_size = outpos;

1078

return out;

1079

}

1080

1081

/* vim: et ts=4 sw=4 sts=4

1082

*/