~bzr-pqm/bzr/bzr.dev : revision 0.17.31

1

/*

2

* diff-delta.c: generate a delta between two buffers

3

*

4

* This code was greatly inspired by parts of LibXDiff from Davide Libenzi

5

* http://www.xmailserver.org/xdiff-lib.html

6

*

7

8

9

*

10

* This code is free software; you can redistribute it and/or modify

11

* it under the terms of the GNU General Public License version 2 as

12

* published by the Free Software Foundation.

13

*/

14

15

#include "delta.h"

16

#include <stdlib.h>

17

#include <string.h>

18

#include <assert.h>

19

20

/* maximum hash entry list for the same hash bucket */

21

#define HASH_LIMIT 64

22

23

#define RABIN_SHIFT 23

24

#define RABIN_WINDOW 16

25

26

static const unsigned int T[256] = {

27

0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,

28

0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,

29

0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,

30

0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,

31

0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,

32

0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,

33

0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,

34

0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,

35

0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,

36

0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,

37

0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,

38

0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,

39

0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,

40

0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,

41

0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,

42

0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,

43

0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,

44

0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,

45

0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,

46

0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,

47

0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,

48

0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,

49

0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,

50

0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,

51

0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,

52

0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,

53

0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,

54

0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,

55

0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,

56

0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,

57

0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,

58

0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,

59

0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,

60

0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,

61

0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,

62

0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,

63

0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,

64

0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,

65

0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,

66

0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,

67

0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,

68

0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,

69

0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811

70

};

71

72

static const unsigned int U[256] = {

73

0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,

74

0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,

75

0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,

76

0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,

77

0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,

78

0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,

79

0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,

80

0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,

81

0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,

82

0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,

83

0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,

84

0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,

85

0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,

86

0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,

87

0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,

88

0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,

89

0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,

90

0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,

91

0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,

92

0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,

93

0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,

94

0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,

95

0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,

96

0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,

97

0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,

98

0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,

99

0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,

100

0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,

101

0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,

102

0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,

103

0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,

104

0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,

105

0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,

106

0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,

107

0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,

108

0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,

109

0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,

110

0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,

111

0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,

112

0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,

113

0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,

114

0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,

115

0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a

116

};

117

118

struct index_entry {

119

const unsigned char *ptr;

120

const struct source_info *src;

121

unsigned int val;

122

};

123

124

struct unpacked_index_entry {

125

struct index_entry entry;

126

struct unpacked_index_entry *next;

127

};

128

129

struct delta_index {

130

unsigned long memsize; /* Total bytes pointed to by this index */

131

const struct source_info *last_src; /* Information about the referenced source */

132

unsigned int hash_mask; /* val & hash_mask gives the hash index for a given

133

entry */

134

unsigned int num_entries; /* The total number of entries in this index */

135

struct index_entry *last_entry; /* Pointer to the last valid entry */

136

struct index_entry *hash[];

137

};

138

139

static unsigned int

140

limit_hash_buckets(struct unpacked_index_entry **hash,

141

unsigned int *hash_count, unsigned int hsize,

142

unsigned int entries)

143

{

144

struct unpacked_index_entry *entry;

145

unsigned int i;

146

/*

147

* Determine a limit on the number of entries in the same hash

148

* bucket. This guards us against pathological data sets causing

149

* really bad hash distribution with most entries in the same hash

150

* bucket that would bring us to O(m*n) computing costs (m and n

151

* corresponding to reference and target buffer sizes).

152

*

153

* Make sure none of the hash buckets has more entries than

154

* we're willing to test. Otherwise we cull the entry list

155

* uniformly to still preserve a good repartition across

156

* the reference buffer.

157

*/

158

for (i = 0; i < hsize; i++) {

159

int acc;

160

161

if (hash_count[i] <= HASH_LIMIT)

162

continue;

163

164

/* We leave exactly HASH_LIMIT entries in the bucket */

165

entries -= hash_count[i] - HASH_LIMIT;

166

167

entry = hash[i];

168

acc = 0;

169

170

/*

171

* Assume that this loop is gone through exactly

172

* HASH_LIMIT times and is entered and left with

173

* acc==0. So the first statement in the loop

174

* contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT

175

* to the accumulator, and the inner loop consequently

176

* is run (hash_count[i]-HASH_LIMIT) times, removing

177

* one element from the list each time. Since acc

178

* balances out to 0 at the final run, the inner loop

179

* body can't be left with entry==NULL. So we indeed

180

* encounter entry==NULL in the outer loop only.

181

*/

182

do {

183

acc += hash_count[i] - HASH_LIMIT;

184

if (acc > 0) {

185

struct unpacked_index_entry *keep = entry;

186

do {

187

entry = entry->next;

188

acc -= HASH_LIMIT;

189

} while (acc > 0);

190

keep->next = entry->next;

191

}

192

entry = entry->next;

193

} while (entry);

194

}

195

return entries;

196

}

197

198

static struct delta_index *

199

pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,

200

unsigned int num_entries)

201

{

202

unsigned int i, memsize;

203

struct unpacked_index_entry *entry;

204

struct delta_index *index;

205

struct index_entry *packed_entry, **packed_hash;

206

void *mem;

207

/*

208

* Now create the packed index in array form

209

* rather than linked lists.

210

*/

211

memsize = sizeof(*index)

212

+ sizeof(*packed_hash) * (hsize+1)

213

+ sizeof(*packed_entry) * num_entries;

214

mem = malloc(memsize);

215

if (!mem) {

216

return NULL;

217

}

218

219

index = mem;

220

index->memsize = memsize;

221

index->hash_mask = hsize - 1;

222

index->num_entries = num_entries;

223

224

mem = index->hash;

225

packed_hash = mem;

226

mem = packed_hash + (hsize+1);

227

packed_entry = mem;

228

229

for (i = 0; i < hsize; i++) {

230

/*

231

* Coalesce all entries belonging to one linked list

232

* into consecutive array entries.

233

*/

234

packed_hash[i] = packed_entry;

235

for (entry = hash[i]; entry; entry = entry->next)

236

*packed_entry++ = entry->entry;

237

}

238

239

/* Sentinel value to indicate the length of the last hash bucket */

240

packed_hash[hsize] = packed_entry;

241

242

assert(packed_entry - (struct index_entry *)mem == num_entries);

243

index->last_entry = (packed_entry - 1);

244

return index;

245

}

246

247

void include_entries_from_index(struct unpacked_index_entry **hash,

248

unsigned int *hash_count,

249

unsigned int hsize,

250

struct unpacked_index_entry *entry,

251

const struct delta_index *old)

252

{

253

unsigned int i, hmask, masked_val;

254

struct index_entry *old_entry;

255

256

hmask = hsize - 1;

257

/* Iterate backwards through the existing entries

258

* This is so that matches early in the file end up being the first pointer

259

* in the linked list.

260

* TODO: We know that all old entries are going to occur before the new

261

* entries, and that we are going to follow this with a limit and

262

* then pack step. There is probably a way we could optimize this

263

* step, so that we wouldn't have to copy everything into a new

264

* buffer, and then copy everything again into yet another buffer.

265

*/

266

for (old_entry = old->last_entry, i = 0; i < old->num_entries;

267

i++, old_entry--) {

268

masked_val = old_entry->val & hmask;

269

entry->entry = *old_entry;

270

entry->next = hash[masked_val];

271

hash[masked_val] = entry++;

272

hash_count[masked_val]++;

273

}

274

}

275

276

struct delta_index *

277

create_index_from_old_and_hash(const struct delta_index *old,

278

struct unpacked_index_entry **hash,

279

unsigned int hsize,

280

unsigned int num_entries)

281

{

282

unsigned int i, memsize, total_num_entries;

283

struct unpacked_index_entry *entry;

284

struct delta_index *index;

285

struct index_entry *packed_entry, **packed_hash;

286

struct index_entry *copy_from_start, *copy_to_start;

287

size_t total_copy, to_copy;

288

unsigned int num_ops;

289

unsigned int bytes_copied;

290

void *mem;

291

/*

292

* Now create the packed index in array form

293

* rather than linked lists.

294

*/

295

total_num_entries = num_entries + old->num_entries;

296

memsize = sizeof(*index)

297

+ sizeof(*packed_hash) * (hsize+1)

298

+ sizeof(*packed_entry) * total_num_entries;

299

mem = malloc(memsize);

300

if (!mem) {

301

return NULL;

302

}

303

304

index = mem;

305

index->memsize = memsize;

306

index->hash_mask = hsize - 1;

307

index->num_entries = total_num_entries;

308

309

mem = index->hash;

310

packed_hash = mem;

311

mem = packed_hash + (hsize+1);

312

packed_entry = mem;

313

314

total_copy = 0;

315

bytes_copied = 0;

316

num_ops = 0;

317

copy_from_start = copy_to_start = NULL;

318

for (i = 0; i < hsize; i++) {

319

/*

320

* Coalesce all entries belonging to one linked list

321

* into consecutive array entries.

322

* The entries in old all come before the entries in hash.

323

*/

324

packed_hash[i] = packed_entry;

325

to_copy = (old->hash[i+1] - old->hash[i]);

326

if (to_copy > 0) {

327

/* This next range can be copied wholesale. However, we will wait

328

* until we have to insert something from the new data before we

329

* copy everything.

330

* So for now, just reserve space for the copy.

331

*/

332

if (total_copy == 0) {

333

copy_from_start = old->hash[i];

334

copy_to_start = packed_entry;

335

}

336

packed_entry += to_copy;

337

total_copy += to_copy;

338

assert((packed_entry - copy_to_start) == total_copy);

339

assert((old->hash[i+1] - copy_from_start) == total_copy);

340

}

341

for (entry = hash[i]; entry; entry = entry->next) {

342

/* We have an entry to insert, so flush the copy buffer */

343

if (total_copy > 0) {

344

assert(copy_to_start != NULL);

345

assert(copy_from_start != NULL);

346

memcpy(copy_to_start, copy_from_start,

347

total_copy * sizeof(struct index_entry));

348

bytes_copied += (total_copy * sizeof(struct index_entry));

349

total_copy = 0;

350

copy_from_start = copy_to_start = NULL;

351

num_ops += 1;

352

}

353

*packed_entry++ = entry->entry;

354

}

355

}

356

if (total_copy > 0) {

357

assert(copy_to_start != NULL);

358

assert(copy_from_start != NULL);

359

memcpy(copy_to_start, copy_from_start,

360

total_copy * sizeof(struct index_entry));

361

bytes_copied += (total_copy * sizeof(struct index_entry));

362

num_ops += 1;

363

}

364

365

/* Sentinel value to indicate the length of the last hash bucket */

366

packed_hash[hsize] = packed_entry;

367

368

assert(packed_entry - (struct index_entry *)mem == total_num_entries);

369

index->last_entry = (packed_entry - 1);

370

return index;

371

}

372

373

374

struct delta_index * create_delta_index(const struct source_info *src,

375

const struct delta_index *old)

376

{

377

unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;

378

unsigned int total_num_entries;

379

const unsigned char *data, *buffer;

380

struct delta_index *index;

381

struct unpacked_index_entry *entry, **hash;

382

void *mem;

383

unsigned long memsize;

384

385

if (!src->buf || !src->size)

386

return NULL;

387

buffer = src->buf;

388

389

/* Determine index hash size. Note that indexing skips the

390

first byte to allow for optimizing the Rabin's polynomial

391

initialization in create_delta(). */

392

num_entries = (src->size - 1) / RABIN_WINDOW;

393

if (old != NULL)

394

total_num_entries = num_entries + old->num_entries;

395

else

396

total_num_entries = num_entries;

397

hsize = total_num_entries / 4;

398

for (i = 4; (1u << i) < hsize && i < 31; i++);

399

hsize = 1 << i;

400

hmask = hsize - 1;

401

402

/* allocate lookup index */

403

memsize = sizeof(*hash) * hsize +

404

sizeof(*entry) * total_num_entries;

405

mem = malloc(memsize);

406

if (!mem)

407

return NULL;

408

hash = mem;

409

mem = hash + hsize;

410

entry = mem;

411

412

memset(hash, 0, hsize * sizeof(*hash));

413

414

/* allocate an array to count hash num_entries */

415

hash_count = calloc(hsize, sizeof(*hash_count));

416

if (!hash_count) {

417

free(hash);

418

return NULL;

419

}

420

421

/* then populate the index for the new data */

422

prev_val = ~0;

423

for (data = buffer + num_entries * RABIN_WINDOW - RABIN_WINDOW;

424

data >= buffer;

425

data -= RABIN_WINDOW) {

426

unsigned int val = 0;

427

for (i = 1; i <= RABIN_WINDOW; i++)

428

val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];

429

if (val == prev_val) {

430

/* keep the lowest of consecutive identical blocks */

431

entry[-1].entry.ptr = data + RABIN_WINDOW;

432

--num_entries;

433

--total_num_entries;

434

} else {

435

prev_val = val;

436

i = val & hmask;

437

entry->entry.ptr = data + RABIN_WINDOW;

438

entry->entry.val = val;

439

entry->entry.src = src;

440

entry->next = hash[i];

441

hash[i] = entry++;

442

hash_count[i]++;

443

}

444

}

445

/* If we have an existing delta_index, we want to bring its info into the

446

* new structure. We assume that the existing structure's objects all occur

447

* before the new source, so they get first precedence in the index.

448

*/

449

if (old != NULL)

450

include_entries_from_index(hash, hash_count, hsize, entry, old);

451

452

total_num_entries = limit_hash_buckets(hash, hash_count, hsize,

453

total_num_entries);

454

free(hash_count);

455

index = pack_delta_index(hash, hsize, total_num_entries);

456

index->last_src = src;

457

free(hash);

458

if (!index) {

459

return NULL;

460

}

461

return index;

462

}

463

464

struct delta_index *

465

create_delta_index_from_delta(const struct source_info *src,

466

const struct delta_index *old)

467

{

468

unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;

469

unsigned int total_num_entries;

470

const unsigned char *data, *buffer, *top;

471

unsigned char cmd;

472

struct delta_index *index;

473

struct unpacked_index_entry *entry, **hash;

474

void *mem;

475

unsigned long memsize;

476

477

if (!src->buf || !src->size)

478

return NULL;

479

buffer = src->buf;

480

top = buffer + src->size;

481

482

/* Determine index hash size. Note that indexing skips the

483

first byte to allow for optimizing the Rabin's polynomial

484

initialization in create_delta().

485

This computes the maximum number of entries that could be held. The

486

actual number will be recomputed during processing.

487

*/

488

489

num_entries = (src->size - 1) / RABIN_WINDOW;

490

if (old != NULL)

491

total_num_entries = num_entries + old->num_entries;

492

else

493

total_num_entries = num_entries;

494

hsize = total_num_entries / 4;

495

for (i = 4; (1u << i) < hsize && i < 31; i++);

496

hsize = 1 << i;

497

hmask = hsize - 1;

498

499

/* allocate lookup index */

500

memsize = sizeof(*hash) * hsize +

501

sizeof(*entry) * total_num_entries;

502

mem = malloc(memsize);

503

if (!mem)

504

return NULL;

505

hash = mem;

506

mem = hash + hsize;

507

entry = mem;

508

509

memset(hash, 0, hsize * sizeof(*hash));

510

511

/* allocate an array to count hash num_entries */

512

hash_count = calloc(hsize, sizeof(*hash_count));

513

if (!hash_count) {

514

free(hash);

515

return NULL;

516

}

517

518

/* then populate the index for the new data */

519

/* The create_delta_index code starts at the end and works backward,

520

* because it is easier to update the entry pointers (you always update the

521

* head, rather than updating the tail).

522

* However, we can't walk the delta structure that way.

523

*/

524

prev_val = ~0;

525

data = buffer;

526

/* source size */

527

get_delta_hdr_size(&data, top);

528

/* target size */

529

get_delta_hdr_size(&data, top);

530

num_entries = 0; /* calculate the real number of entries */

531

while (data < top) {

532

cmd = *data++;

533

if (cmd & 0x80) {

534

/* Copy instruction, skip it */

535

if (cmd & 0x01) data++;

536

if (cmd & 0x02) data++;

537

if (cmd & 0x04) data++;

538

if (cmd & 0x08) data++;

539

if (cmd & 0x10) data++;

540

if (cmd & 0x20) data++;

541

if (cmd & 0x40) data++;

542

} else if (cmd) {

543

/* Insert instruction, we want to index these bytes */

544

if (data + cmd > top) {

545

/* Invalid insert, not enough bytes in the delta */

546

break;

547

}

548

for (; cmd > RABIN_WINDOW; cmd -= RABIN_WINDOW,

549

data += RABIN_WINDOW) {

550

unsigned int val = 0;

551

for (i = 1; i <= RABIN_WINDOW; i++)

552

val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];

553

if (val != prev_val) {

554

/* Only keep the first of consecutive data */

555

prev_val = val;

556

i = val & hmask;

557

entry->entry.ptr = data + RABIN_WINDOW;

558

entry->entry.val = val;

559

entry->entry.src = src;

560

entry->next = NULL;

561

/* Now we have to insert this entry at the end of the hash

562

* map

563

*/

564

if (hash[i] == NULL) {

565

hash[i] = entry;

566

} else {

567

struct unpacked_index_entry *this;

568

for (this = hash[i];

569

this->next != NULL;

570

this = this->next) { /* No action */ }

571

this->next = entry;

572

this = entry;

573

}

574

hash_count[i]++;

575

entry++;

576

num_entries++;

577

}

578

}

579

/* Move the data pointer by whatever remainder is left */

580

data += cmd;

581

} else {

582

/*

583

* cmd == 0 is reserved for future encoding

584

* extensions. In the mean time we must fail when

585

* encountering them (might be data corruption).

586

*/

587

break;

588

}

589

}

590

if (data != top) {

591

/* Something was wrong with this delta */

592

free(hash);

593

free(hash_count);

594

return NULL;

595

}

596

if (num_entries == 0) {

597

/** Nothing to index **/

598

free(hash);

599

free(hash_count);

600

return NULL;

601

}

602

if (old != NULL) {

603

if (hmask == old->hash_mask) {

604

/* The total hash table size didn't change, which means that

605

* entries will end up in the same pages. We can do bulk-copying to

606

* get the final output

607

*/

608

index = create_index_from_old_and_hash(old, hash, hsize,

609

num_entries);

610

free(hash_count);

611

free(hash);

612

if (!index) {

613

return NULL;

614

}

615

index->last_src = src;

616

return index;

617

} else {

618

total_num_entries = num_entries + old->num_entries;

619

include_entries_from_index(hash, hash_count, hsize, entry, old);

620

}

621

} else {

622

total_num_entries = num_entries;

623

}

624

625

total_num_entries = limit_hash_buckets(hash, hash_count, hsize,

626

total_num_entries);

627

free(hash_count);

628

index = pack_delta_index(hash, hsize, total_num_entries);

629

free(hash);

630

if (!index) {

631

return NULL;

632

}

633

index->last_src = src;

634

return index;

635

}

636

637

void free_delta_index(struct delta_index *index)

638

{

639

free(index);

640

}

641

642

unsigned long sizeof_delta_index(struct delta_index *index)

643

{

644

if (index)

645

return index->memsize;

646

else

647

return 0;

648

}

649

650

/*

651

* The maximum size for any opcode sequence, including the initial header

652

* plus Rabin window plus biggest copy.

653

*/

654

#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)

655

656

void *

657

create_delta(const struct delta_index *index,

658

const void *trg_buf, unsigned long trg_size,

659

unsigned long *delta_size, unsigned long max_size)

660

{

661

unsigned int i, outpos, outsize, moff, msize, val;

662

const struct source_info *msource;

663

int inscnt;

664

const unsigned char *ref_data, *ref_top, *data, *top;

665

unsigned char *out;

666

unsigned long source_size;

667

668

if (!trg_buf || !trg_size)

669

return NULL;

670

if (index == NULL)

671

return NULL;

672

673

outpos = 0;

674

outsize = 8192;

675

if (max_size && outsize >= max_size)

676

outsize = max_size + MAX_OP_SIZE + 1;

677

out = malloc(outsize);

678

if (!out)

679

return NULL;

680

681

/* store reference buffer size */

682

source_size = index->last_src->size + index->last_src->agg_offset;

683

i = source_size;

684

while (i >= 0x80) {

685

out[outpos++] = i | 0x80;

686

i >>= 7;

687

}

688

out[outpos++] = i;

689

690

/* store target buffer size */

691

i = trg_size;

692

while (i >= 0x80) {

693

out[outpos++] = i | 0x80;

694

i >>= 7;

695

}

696

out[outpos++] = i;

697

698

data = trg_buf;

699

top = (const unsigned char *) trg_buf + trg_size;

700

701

/* Start the matching by filling out with a simple 'insert' instruction, of

702

* the first RABIN_WINDOW bytes of the input.

703

*/

704

outpos++; /* leave a byte for the insert command */

705

val = 0;

706

for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {

707

out[outpos++] = *data;

708

val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];

709

}

710

/* we are now setup with an insert of 'i' bytes and val contains the RABIN

711

* hash for those bytes, and data points to the RABIN_WINDOW+1 byte of

712

* input.

713

*/

714

inscnt = i;

715

716

moff = 0;

717

msize = 0;

718

msource = NULL;

719

while (data < top) {

720

if (msize < 4096) {

721

/* we don't have a 'worthy enough' match yet, so let's look for

722

* one.

723

*/

724

struct index_entry *entry;

725

/* Shift the window by one byte. */

726

val ^= U[data[-RABIN_WINDOW]];

727

val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];

728

i = val & index->hash_mask;

729

/* TODO: When using multiple indexes like this, the hash tables

730

* mapping val => index_entry become less efficient.

731

* You end up getting a lot more collisions in the hash,

732

* which doesn't actually lead to a entry->val match.

733

*/

734

for (entry = index->hash[i]; entry < index->hash[i+1];

735

entry++) {

736

const unsigned char *ref;

737

const unsigned char *src;

738

unsigned int ref_size;

739

if (entry->val != val)

740

continue;

741

ref = entry->ptr;

742

src = data;

743

ref_data = entry->src->buf;

744

ref_top = ref_data + entry->src->size;

745

ref_size = ref_top - ref;

746

/* ref_size is the longest possible match that we could make

747

* here. If ref_size <= msize, then we know that we cannot

748

* match more bytes with this location that we have already

749

* matched.

750

*/

751

if (ref_size > top - src)

752

ref_size = top - src;

753

if (ref_size <= msize)

754

break;

755

/* See how many bytes actually match at this location. */

756

while (ref_size-- && *src++ == *ref)

757

ref++;

758

if (msize < ref - entry->ptr) {

759

/* this is our best match so far */

760

msize = ref - entry->ptr;

761

msource = entry->src;

762

moff = entry->ptr - ref_data;

763

if (msize >= 4096) /* good enough */

764

break;

765

}

766

}

767

}

768

769

if (msize < 4) {

770

/* The best match right now is less than 4 bytes long. So just add

771

* the current byte to the insert instruction. Increment the insert

772

* counter, and copy the byte of data into the output buffer.

773

*/

774

if (!inscnt)

775

outpos++;

776

out[outpos++] = *data++;

777

inscnt++;

778

if (inscnt == 0x7f) {

779

/* We have a max length insert instruction, finalize it in the

780

* output.

781

*/

782

out[outpos - inscnt - 1] = inscnt;

783

inscnt = 0;

784

}

785

msize = 0;

786

} else {

787

unsigned int left;

788

unsigned char *op;

789

790

if (inscnt) {

791

ref_data = msource->buf;

792

while (moff && ref_data[moff-1] == data[-1]) {

793

/* we can match one byte back */

794

msize++;

795

moff--;

796

data--;

797

outpos--;

798

if (--inscnt)

799

continue;

800

outpos--; /* remove count slot */

801

inscnt--; /* make it -1 */

802

break;

803

}

804

out[outpos - inscnt - 1] = inscnt;

805

inscnt = 0;

806

}

807

808

/* A copy op is currently limited to 64KB (pack v2) */

809

left = (msize < 0x10000) ? 0 : (msize - 0x10000);

810

msize -= left;

811

812

op = out + outpos++;

813

i = 0x80;

814

815

/* moff is the offset in the local structure, for encoding, we need

816

* to push it into the global offset

817

*/

818

assert(moff < msource->size);

819

moff += msource->agg_offset;

820

assert(moff + msize <= source_size);

821

if (moff & 0x000000ff)

822

out[outpos++] = moff >> 0, i |= 0x01;

823

if (moff & 0x0000ff00)

824

out[outpos++] = moff >> 8, i |= 0x02;

825

if (moff & 0x00ff0000)

826

out[outpos++] = moff >> 16, i |= 0x04;

827

if (moff & 0xff000000)

828

out[outpos++] = moff >> 24, i |= 0x08;

829

/* Put it back into local coordinates, in case we have multiple

830

* copies in a row.

831

*/

832

moff -= msource->agg_offset;

833

834

if (msize & 0x00ff)

835

out[outpos++] = msize >> 0, i |= 0x10;

836

if (msize & 0xff00)

837

out[outpos++] = msize >> 8, i |= 0x20;

838

839

*op = i;

840

841

data += msize;

842

moff += msize;

843

msize = left;

844

845

if (msize < 4096) {

846

int j;

847

val = 0;

848

for (j = -RABIN_WINDOW; j < 0; j++)

849

val = ((val << 8) | data[j])

850

^ T[val >> RABIN_SHIFT];

851

}

852

}

853

854

if (outpos >= outsize - MAX_OP_SIZE) {

855

void *tmp = out;

856

outsize = outsize * 3 / 2;

857

if (max_size && outsize >= max_size)

858

outsize = max_size + MAX_OP_SIZE + 1;

859

if (max_size && outpos > max_size)

860

break;

861

out = realloc(out, outsize);

862

if (!out) {

863

free(tmp);

864

return NULL;

865

}

866

}

867

}

868

869

if (inscnt)

870

out[outpos - inscnt - 1] = inscnt;

871

872

if (max_size && outpos > max_size) {

873

free(out);

874

return NULL;

875

}

876

877

*delta_size = outpos;

878

return out;

879

}

880

881

/* vim: et ts=4 sw=4 sts=4

882

*/