~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/diff-delta.c

Committer: Vincent Ladeuil
Date: 2009-03-24 09:38:47 UTC
mfrom: (3735.2.165 brisbane-core)
mto: (3735.40.1 for-jam) (4194.1.3 groupcompress-python-only)
mto: This revision was merged to the branch mainline in revision 4280.
Revision ID: v.ladeuil+lp@free.fr-20090324093847-zwbjrv8awx73feiz

create thread for bbc

files added:
bzrlib/_chk_map_py.py

bzrlib/_chk_map_pyx.pyx

bzrlib/_groupcompress_pyx.pyx

bzrlib/chk_map.py

bzrlib/chk_serializer.py

bzrlib/delta.h

bzrlib/diff-delta.c

bzrlib/groupcompress.py

bzrlib/repofmt/groupcompress_repo.py

bzrlib/tests/per_repository_chk

bzrlib/tests/per_repository_chk/__init__.py

bzrlib/tests/per_repository_chk/test_supported.py

bzrlib/tests/per_repository_chk/test_unsupported.py

bzrlib/tests/test__chk_map.py

bzrlib/tests/test__groupcompress_pyx.py

bzrlib/tests/test_chk_map.py

bzrlib/tests/test_groupcompress.py

doc/developers/groupcompress-design.txt

files modified:
.bzrignore

BRANCH.TODO

bzrlib/bzrdir.py

bzrlib/inventory.py

bzrlib/knit.py

bzrlib/memorytree.py

bzrlib/mutabletree.py

bzrlib/reconcile.py

bzrlib/remote.py

bzrlib/repofmt/knitrepo.py

bzrlib/repofmt/pack_repo.py

bzrlib/repofmt/weaverepo.py

bzrlib/repository.py

bzrlib/revisiontree.py

bzrlib/tests/__init__.py

bzrlib/tests/intertree_implementations/__init__.py

bzrlib/tests/intertree_implementations/test_compare.py

bzrlib/tests/inventory_implementations/basics.py

bzrlib/tests/per_repository/test_add_inventory_by_delta.py

bzrlib/tests/per_repository/test_check.py

bzrlib/tests/per_repository/test_commit_builder.py

bzrlib/tests/per_repository/test_repository.py

bzrlib/tests/per_repository_reference/test_add_inventory.py

bzrlib/tests/per_repository_reference/test_add_revision.py

bzrlib/tests/per_repository_reference/test_add_signature_text.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_pack_repository.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_versionedfile.py

bzrlib/tree.py

bzrlib/versionedfile.py

bzrlib/weave.py

bzrlib/workingtree_4.py

doc/developers/development-repo.txt

setup.py

Show diffs side-by-side

added added

removed removed

bzrlib/diff-delta.c

* diff-delta.c: generate a delta between two buffers

* This code was greatly inspired by parts of LibXDiff from Davide Libenzi

* http://www.xmailserver.org/xdiff-lib.html

* This code is free software; you can redistribute it and/or modify

* it under the terms of the GNU General Public License version 2 as

* published by the Free Software Foundation.

#include <stdio.h>

#include "delta.h"

#include <stdlib.h>

#include <string.h>

#include <assert.h>

/* maximum hash entry list for the same hash bucket */

#define HASH_LIMIT 64

#define RABIN_SHIFT 23

#define RABIN_WINDOW 16

/* The hash map is sized to put 4 entries per bucket, this gives us ~even room

* for more data. Tweaking this number above 4 doesn't seem to help much,

* anyway.

#define EXTRA_NULLS 4

static const unsigned int T[256] = {

0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,

0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,

0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,

0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,

0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,

0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,

0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,

0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,

0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,

0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,

0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,

0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,

0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,

0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,

0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,

0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,

0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,

0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,

0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,

0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,

0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,

0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,

0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,

0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,

0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,

0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,

0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,

0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,

0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,

0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,

0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,

0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,

0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,

0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,

0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,

0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,

0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,

0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,

0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,

0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,

0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,

0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,

0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811

};

static const unsigned int U[256] = {

0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,

0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,

0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,

0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,

0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,

0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,

0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,

0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,

0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,

0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,

0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,

0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,

0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,

0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,

0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,

0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,

0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,

0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,

0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,

100

0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,

101

0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,

102

0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,

103

0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,

104

0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,

105

0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,

106

0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,

107

0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,

108

0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,

109

0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,

110

0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,

111

0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,

112

0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,

113

0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,

114

0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,

115

0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,

116

0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,

117

0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,

118

0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,

119

0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,

120

0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,

121

0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,

122

0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,

123

0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a

124

};

125

126

struct index_entry {

127

const unsigned char *ptr;

128

const struct source_info *src;

129

unsigned int val;

130

};

131

132

struct index_entry_linked_list {

133

struct index_entry *p_entry;

134

struct index_entry_linked_list *next;

135

};

136

137

struct unpacked_index_entry {

138

struct index_entry entry;

139

struct unpacked_index_entry *next;

140

};

141

142

struct delta_index {

143

unsigned long memsize; /* Total bytes pointed to by this index */

144

const struct source_info *last_src; /* Information about the referenced source */

145

unsigned int hash_mask; /* val & hash_mask gives the hash index for a given

146

entry */

147

unsigned int num_entries; /* The total number of entries in this index */

148

struct index_entry *last_entry; /* Pointer to the last valid entry */

149

struct index_entry *hash[];

150

};

151

152

static unsigned int

153

limit_hash_buckets(struct unpacked_index_entry **hash,

154

unsigned int *hash_count, unsigned int hsize,

155

unsigned int entries)

156

{

157

struct unpacked_index_entry *entry;

158

unsigned int i;

159

160

* Determine a limit on the number of entries in the same hash

161

* bucket. This guards us against pathological data sets causing

162

* really bad hash distribution with most entries in the same hash

163

* bucket that would bring us to O(m*n) computing costs (m and n

164

* corresponding to reference and target buffer sizes).

165

166

* Make sure none of the hash buckets has more entries than

167

* we're willing to test. Otherwise we cull the entry list

168

* uniformly to still preserve a good repartition across

169

* the reference buffer.

170

171

for (i = 0; i < hsize; i++) {

172

int acc;

173

174

if (hash_count[i] <= HASH_LIMIT)

175

continue;

176

177

/* We leave exactly HASH_LIMIT entries in the bucket */

178

entries -= hash_count[i] - HASH_LIMIT;

179

180

entry = hash[i];

181

acc = 0;

182

183

184

* Assume that this loop is gone through exactly

185

* HASH_LIMIT times and is entered and left with

186

* acc==0. So the first statement in the loop

187

* contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT

188

* to the accumulator, and the inner loop consequently

189

* is run (hash_count[i]-HASH_LIMIT) times, removing

190

* one element from the list each time. Since acc

191

* balances out to 0 at the final run, the inner loop

192

* body can't be left with entry==NULL. So we indeed

193

* encounter entry==NULL in the outer loop only.

194

195

do {

196

acc += hash_count[i] - HASH_LIMIT;

197

if (acc > 0) {

198

struct unpacked_index_entry *keep = entry;

199

do {

200

entry = entry->next;

201

acc -= HASH_LIMIT;

202

} while (acc > 0);

203

keep->next = entry->next;

204

}

205

entry = entry->next;

206

} while (entry);

207

}

208

return entries;

209

}

210

211

static struct delta_index *

212

pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,

213

unsigned int num_entries, struct delta_index *old_index)

214

{

215

unsigned int i, j, hmask, memsize, fit_in_old, copied_count;

216

struct unpacked_index_entry *entry;

217

struct delta_index *index;

218

struct index_entry *packed_entry, **packed_hash, *old_entry, *copy_from;

219

struct index_entry null_entry = {0};

220

void *mem;

221

222

hmask = hsize - 1;

223

224

// if (old_index) {

225

// fprintf(stderr, "Packing %d entries into %d for total of %d entries"

226

// " %x => %x\n",

227

// num_entries - old_index->num_entries,

228

// old_index->num_entries, num_entries,

229

// old_index->hash_mask, hmask);

230

// } else {

231

// fprintf(stderr, "Packing %d entries into a new index\n",

232

// num_entries);

233

// }

234

/* First, see if we can squeeze the new items into the existing structure.

235

236

fit_in_old = 0;

237

copied_count = 0;

238

if (old_index && old_index->hash_mask == hmask) {

239

fit_in_old = 1;

240

for (i = 0; i < hsize; ++i) {

241

packed_entry = NULL;

242

for (entry = hash[i]; entry; entry = entry->next) {

243

if (packed_entry == NULL) {

244

/* Find the last open spot */

245

packed_entry = old_index->hash[i + 1];

246

--packed_entry;

247

while (packed_entry >= old_index->hash[i]

248

&& packed_entry->ptr == NULL) {

249

--packed_entry;

250

}

251

++packed_entry;

252

}

253

if (packed_entry >= old_index->hash[i+1]

254

|| packed_entry->ptr != NULL) {

255

/* There are no free spots here :( */

256

fit_in_old = 0;

257

break;

258

}

259

/* We found an empty spot to put this entry

260

* Copy it over, and remove it from the linked list, just in

261

* case we end up running out of room later.

262

263

*packed_entry++ = entry->entry;

264

assert(entry == hash[i]);

265

hash[i] = entry->next;

266

copied_count += 1;

267

old_index->num_entries++;

268

}

269

if (!fit_in_old) {

270

break;

271

}

272

}

273

}

274

if (old_index) {

275

if (fit_in_old) {

276

// fprintf(stderr, "Fit all %d entries into old index\n",

277

// copied_count);

278

/* No need to allocate a new buffer */

279

return NULL;

280

} else {

281

// fprintf(stderr, "Fit only %d entries into old index,"

282

// " reallocating\n", copied_count);

283

}

284

}

285

286

* Now create the packed index in array form

287

* rather than linked lists.

288

* Leave a 2-entry gap for inserting more entries between the groups

289

290

memsize = sizeof(*index)

291

+ sizeof(*packed_hash) * (hsize+1)

292

+ sizeof(*packed_entry) * (num_entries + hsize * EXTRA_NULLS);

293

mem = malloc(memsize);

294

if (!mem) {

295

return NULL;

296

}

297

298

index = mem;

299

index->memsize = memsize;

300

index->hash_mask = hmask;

301

index->num_entries = num_entries;

302

if (old_index) {

303

if (hmask < old_index->hash_mask) {

304

fprintf(stderr, "hash mask was shrunk %x => %x\n",

305

old_index->hash_mask, hmask);

306

}

307

assert(hmask >= old_index->hash_mask);

308

}

309

310

mem = index->hash;

311

packed_hash = mem;

312

mem = packed_hash + (hsize+1);

313

packed_entry = mem;

314

315

for (i = 0; i < hsize; i++) {

316

317

* Coalesce all entries belonging to one linked list

318

* into consecutive array entries.

319

320

packed_hash[i] = packed_entry;

321

/* Old comes earlier as a source, so it always comes first in a given

322

* hash bucket.

323

324

if (old_index) {

325

/* Could we optimize this to use memcpy when hmask ==

326

* old_index->hash_mask? Would it make any real difference?

327

328

j = i & old_index->hash_mask;

329

copy_from = old_index->hash[j];

330

for (old_entry = old_index->hash[j];

331

old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;

332

old_entry++) {

333

if ((old_entry->val & hmask) == i) {

334

*packed_entry++ = *old_entry;

335

}

336

}

337

}

338

for (entry = hash[i]; entry; entry = entry->next) {

339

*packed_entry++ = entry->entry;

340

}

341

/* TODO: At this point packed_entry - packed_hash[i] is the number of

342

* records that we have inserted into this hash bucket.

343

* We should *really* consider doing some limiting along the

344

* lines of limit_hash_buckets() to avoid pathological behavior.

345

346

/* Now add extra 'NULL' entries that we can use for future expansion. */

347

for (j = 0; j < EXTRA_NULLS; ++j ) {

348

*packed_entry++ = null_entry;

349

}

350

}

351

352

/* Sentinel value to indicate the length of the last hash bucket */

353

packed_hash[hsize] = packed_entry;

354

355

if (packed_entry - (struct index_entry *)mem

356

!= num_entries + hsize*EXTRA_NULLS) {

357

fprintf(stderr, "We expected %d entries, but created %d\n",

358

num_entries + hsize*EXTRA_NULLS,

359

(int)(packed_entry - (struct index_entry*)mem));

360

}

361

assert(packed_entry - (struct index_entry *)mem

362

== num_entries + hsize*EXTRA_NULLS);

363

index->last_entry = (packed_entry - 1);

364

return index;

365

}

366

367

368

struct delta_index *

369

create_delta_index(const struct source_info *src,

370

struct delta_index *old)

371

{

372

unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;

373

unsigned int total_num_entries;

374

const unsigned char *data, *buffer;

375

struct delta_index *index;

376

struct unpacked_index_entry *entry, **hash;

377

void *mem;

378

unsigned long memsize;

379

380

if (!src->buf || !src->size)

381

return NULL;

382

buffer = src->buf;

383

384

/* Determine index hash size. Note that indexing skips the

385

first byte to allow for optimizing the Rabin's polynomial

386

initialization in create_delta(). */

387

num_entries = (src->size - 1) / RABIN_WINDOW;

388

if (old != NULL)

389

total_num_entries = num_entries + old->num_entries;

390

else

391

total_num_entries = num_entries;

392

hsize = total_num_entries / 4;

393

for (i = 4; (1u << i) < hsize && i < 31; i++);

394

hsize = 1 << i;

395

hmask = hsize - 1;

396

if (old && old->hash_mask > hmask) {

397

hmask = old->hash_mask;

398

hsize = hmask + 1;

399

}

400

401

/* allocate lookup index */

402

memsize = sizeof(*hash) * hsize +

403

sizeof(*entry) * total_num_entries;

404

mem = malloc(memsize);

405

if (!mem)

406

return NULL;

407

hash = mem;

408

mem = hash + hsize;

409

entry = mem;

410

411

memset(hash, 0, hsize * sizeof(*hash));

412

413

/* allocate an array to count hash num_entries */

414

hash_count = calloc(hsize, sizeof(*hash_count));

415

if (!hash_count) {

416

free(hash);

417

return NULL;

418

}

419

420

/* then populate the index for the new data */

421

prev_val = ~0;

422

for (data = buffer + num_entries * RABIN_WINDOW - RABIN_WINDOW;

423

data >= buffer;

424

data -= RABIN_WINDOW) {

425

unsigned int val = 0;

426

for (i = 1; i <= RABIN_WINDOW; i++)

427

val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];

428

if (val == prev_val) {

429

/* keep the lowest of consecutive identical blocks */

430

entry[-1].entry.ptr = data + RABIN_WINDOW;

431

--num_entries;

432

--total_num_entries;

433

} else {

434

prev_val = val;

435

i = val & hmask;

436

entry->entry.ptr = data + RABIN_WINDOW;

437

entry->entry.val = val;

438

entry->entry.src = src;

439

entry->next = hash[i];

440

hash[i] = entry++;

441

hash_count[i]++;

442

}

443

}

444

/* TODO: It would be nice to limit_hash_buckets at a better time. */

445

total_num_entries = limit_hash_buckets(hash, hash_count, hsize,

446

total_num_entries);

447

free(hash_count);

448

if (old) {

449

old->last_src = src;

450

}

451

index = pack_delta_index(hash, hsize, total_num_entries, old);

452

free(hash);

453

if (!index) {

454

return NULL;

455

}

456

index->last_src = src;

457

return index;

458

}

459

460

/* Take some entries, and put them into a custom hash.

461

* @param entries A list of entries, sorted by position in file

462

* @param num_entries Length of entries

463

* @param out_hsize The maximum size of the hash, the final size will be

464

* returned here

465

466

struct index_entry_linked_list **

467

_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,

468

unsigned int hsize)

469

{

470

unsigned int hash_offset, hmask, memsize;

471

struct index_entry *entry, *last_entry;

472

struct index_entry_linked_list *out_entry, **hash;

473

void *mem;

474

475

hmask = hsize - 1;

476

477

memsize = sizeof(*hash) * hsize +

478

sizeof(*out_entry) * num_entries;

479

mem = malloc(memsize);

480

if (!mem)

481

return NULL;

482

hash = mem;

483

mem = hash + hsize;

484

out_entry = mem;

485

486

memset(hash, 0, sizeof(*hash)*(hsize+1));

487

488

/* We know that entries are in the order we want in the output, but they

489

* aren't "grouped" by hash bucket yet.

490

491

last_entry = entries + num_entries;

492

for (entry = entries + num_entries - 1; entry >= entries; --entry) {

493

hash_offset = entry->val & hmask;

494

out_entry->p_entry = entry;

495

out_entry->next = hash[hash_offset];

496

/* TODO: Remove entries that have identical vals, or at least filter

497

* the map a little bit.

498

* if (hash[i] != NULL) {

499

* }

500

501

hash[hash_offset] = out_entry;

502

++out_entry;

503

}

504

return hash;

505

}

506

507

508

struct delta_index *

509

create_index_from_old_and_new_entries(const struct delta_index *old_index,

510

struct index_entry *entries,

511

unsigned int num_entries)

512

{

513

unsigned int i, j, hsize, hmask, total_num_entries;

514

struct delta_index *index;

515

struct index_entry *entry, *packed_entry, **packed_hash;

516

struct index_entry *last_entry, null_entry = {0};

517

void *mem;

518

unsigned long memsize;

519

struct index_entry_linked_list *unpacked_entry, **mini_hash;

520

521

/* Determine index hash size. Note that indexing skips the

522

first byte to allow for optimizing the Rabin's polynomial

523

initialization in create_delta(). */

524

total_num_entries = num_entries + old_index->num_entries;

525

hsize = total_num_entries / 4;

526

for (i = 4; (1u << i) < hsize && i < 31; i++);

527

hsize = 1 << i;

528

if (hsize < old_index->hash_mask) {

529

/* For some reason, there was a code path that would actually *shrink*

530

* the hash size. This screws with some later code, and in general, I

531

* think it better to make the hash bigger, rather than smaller. So

532

* we'll just force the size here.

533

* Possibly done by create_delta_index running into a

534

* limit_hash_buckets call, that ended up transitioning across a

535

* power-of-2. The cause isn't 100% clear, though.

536

537

hsize = old_index->hash_mask + 1;

538

}

539

hmask = hsize - 1;

540

// fprintf(stderr, "resizing index to insert %d entries into array"

541

// " with %d entries: %x => %x\n",

542

// num_entries, old_index->num_entries, old_index->hash_mask, hmask);

543

544

memsize = sizeof(*index)

545

+ sizeof(*packed_hash) * (hsize+1)

546

+ sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);

547

mem = malloc(memsize);

548

if (!mem) {

549

return NULL;

550

}

551

index = mem;

552

index->memsize = memsize;

553

index->hash_mask = hmask;

554

index->num_entries = total_num_entries;

555

index->last_src = old_index->last_src;

556

557

mem = index->hash;

558

packed_hash = mem;

559

mem = packed_hash + (hsize+1);

560

packed_entry = mem;

561

562

mini_hash = _put_entries_into_hash(entries, num_entries, hsize);

563

if (mini_hash == NULL) {

564

free(index);

565

return NULL;

566

}

567

last_entry = entries + num_entries;

568

for (i = 0; i < hsize; i++) {

569

570

* Coalesce all entries belonging in one hash bucket

571

* into consecutive array entries.

572

* The entries in old_index all come before 'entries'.

573

574

packed_hash[i] = packed_entry;

575

/* Copy any of the old entries across */

576

/* Would we rather use memcpy? */

577

if (hmask == old_index->hash_mask) {

578

for (entry = old_index->hash[i];

579

entry < old_index->hash[i+1] && entry->ptr != NULL;

580

++entry) {

581

assert((entry->val & hmask) == i);

582

*packed_entry++ = *entry;

583

}

584

} else {

585

/* If we resized the index from this action, all of the old values

586

* will be found in the previous location, but they will end up

587

* spread across the new locations.

588

589

j = i & old_index->hash_mask;

590

for (entry = old_index->hash[j];

591

entry < old_index->hash[j+1] && entry->ptr != NULL;

592

++entry) {

593

assert((entry->val & old_index->hash_mask) == j);

594

if ((entry->val & hmask) == i) {

595

/* Any entries not picked up here will be picked up on the

596

* next pass.

597

598

*packed_entry++ = *entry;

599

}

600

}

601

}

602

/* Now see if we need to insert any of the new entries.

603

* Note that loop ends up O(hsize*num_entries), so we expect that

604

* num_entries is always small.

605

* We also help a little bit by collapsing the entry range when the

606

* endpoints are inserted. However, an alternative would be to build a

607

* quick hash lookup for just the new entries.

608

* Testing shows that this list can easily get up to about 100

609

* entries, the tradeoff is a malloc, 1 pass over the entries, copying

610

* them into a sorted buffer, and a free() when done,

611

612

for (unpacked_entry = mini_hash[i];

613

unpacked_entry;

614

unpacked_entry = unpacked_entry->next) {

615

assert((unpacked_entry->p_entry->val & hmask) == i);

616

*packed_entry++ = *(unpacked_entry->p_entry);

617

}

618

/* Now insert some extra nulls */

619

for (j = 0; j < EXTRA_NULLS; ++j) {

620

*packed_entry++ = null_entry;

621

}

622

}

623

free(mini_hash);

624

625

/* Sentinel value to indicate the length of the last hash bucket */

626

packed_hash[hsize] = packed_entry;

627

628

if ((packed_entry - (struct index_entry *)mem)

629

!= (total_num_entries + hsize*EXTRA_NULLS)) {

630

fprintf(stderr, "We expected %d entries, but created %d\n",

631

total_num_entries + hsize*EXTRA_NULLS,

632

(int)(packed_entry - (struct index_entry*)mem));

633

fflush(stderr);

634

}

635

assert((packed_entry - (struct index_entry *)mem)

636

== (total_num_entries + hsize * EXTRA_NULLS));

637

index->last_entry = (packed_entry - 1);

638

return index;

639

}

640

641

642

void

643

get_text(char buff[128], const unsigned char *ptr)

644

{

645

unsigned int i;

646

const unsigned char *start;

647

unsigned char cmd;

648

start = (ptr-RABIN_WINDOW-1);

649

cmd = *(start);

650

if (cmd < 0x80) {// This is likely to be an insert instruction

651

if (cmd < RABIN_WINDOW) {

652

cmd = RABIN_WINDOW;

653

}

654

} else {

655

/* This was either a copy [should never be] or it

656

* was a longer insert so the insert start happened at 16 more

657

* bytes back.

658

659

cmd = RABIN_WINDOW + 1;

660

}

661

if (cmd > 60) {

662

cmd = 60; /* Be friendly to 80char terms */

663

}

664

/* Copy the 1 byte command, and 4 bytes after the insert */

665

cmd += 5;

666

memcpy(buff, start, cmd);

667

buff[cmd] = 0;

668

for (i = 0; i < cmd; ++i) {

669

if (buff[i] == '\n') {

670

buff[i] = 'N';

671

} else if (buff[i] == '\t') {

672

buff[i] = 'T';

673

}

674

}

675

}

676

677

struct delta_index *

678

create_delta_index_from_delta(const struct source_info *src,

679

struct delta_index *old_index)

680

{

681

unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;

682

unsigned int hash_offset;

683

const unsigned char *data, *buffer, *top;

684

unsigned char cmd;

685

struct delta_index *new_index;

686

struct index_entry *entry, *entries, *old_entry;

687

688

if (!src->buf || !src->size)

689

return NULL;

690

buffer = src->buf;

691

top = buffer + src->size;

692

693

/* Determine index hash size. Note that indexing skips the

694

first byte to allow for optimizing the Rabin's polynomial

695

initialization in create_delta().

696

This computes the maximum number of entries that could be held. The

697

actual number will be recomputed during processing.

698

699

700

max_num_entries = (src->size - 1) / RABIN_WINDOW;

701

702

/* allocate an array to hold whatever entries we find */

703

entries = malloc(sizeof(*entry) * max_num_entries);

704

if (!entries) /* malloc failure */

705

return NULL;

706

707

/* then populate the index for the new data */

708

prev_val = ~0;

709

data = buffer;

710

/* source size */

711

get_delta_hdr_size(&data, top);

712

/* target size */

713

get_delta_hdr_size(&data, top);

714

entry = entries; /* start at the first slot */

715

num_entries = 0; /* calculate the real number of entries */

716

while (data < top) {

717

cmd = *data++;

718

if (cmd & 0x80) {

719

/* Copy instruction, skip it */

720

if (cmd & 0x01) data++;

721

if (cmd & 0x02) data++;

722

if (cmd & 0x04) data++;

723

if (cmd & 0x08) data++;

724

if (cmd & 0x10) data++;

725

if (cmd & 0x20) data++;

726

if (cmd & 0x40) data++;

727

} else if (cmd) {

728

/* Insert instruction, we want to index these bytes */

729

if (data + cmd > top) {

730

/* Invalid insert, not enough bytes in the delta */

731

break;

732

}

733

/* The create_delta code requires a match at least 4 characters

734

* (including only the last char of the RABIN_WINDOW) before it

735

* will consider it something worth copying rather than inserting.

736

* So we don't want to index anything that we know won't ever be a

737

* match.

738

739

for (; cmd > RABIN_WINDOW + 3; cmd -= RABIN_WINDOW,

740

data += RABIN_WINDOW) {

741

unsigned int val = 0;

742

for (i = 1; i <= RABIN_WINDOW; i++)

743

val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];

744

if (val != prev_val) {

745

/* Only keep the first of consecutive data */

746

prev_val = val;

747

num_entries++;

748

entry->ptr = data + RABIN_WINDOW;

749

entry->val = val;

750

entry->src = src;

751

entry++;

752

if (num_entries > max_num_entries) {

753

/* We ran out of entry room, something is really wrong

754

755

break;

756

}

757

}

758

}

759

/* Move the data pointer by whatever remainder is left */

760

data += cmd;

761

} else {

762

763

* cmd == 0 is reserved for future encoding

764

* extensions. In the mean time we must fail when

765

* encountering them (might be data corruption).

766

767

break;

768

}

769

}

770

if (data != top) {

771

/* Something was wrong with this delta */

772

free(entries);

773

return NULL;

774

}

775

if (num_entries == 0) {

776

/** Nothing to index **/

777

free(entries);

778

return NULL;

779

}

780

assert(old_index != NULL);

781

old_index->last_src = src;

782

/* See if we can fill in these values into the holes in the array */

783

entry = entries;

784

num_inserted = 0;

785

for (; num_entries > 0; --num_entries, ++entry) {

786

hash_offset = (entry->val & old_index->hash_mask);

787

/* The basic structure is a hash => packed_entries that fit in that

788

* hash bucket. Things are structured such that the hash-pointers are

789

* strictly ordered. So we start by pointing to the next pointer, and

790

* walk back until we stop getting NULL targets, and then go back

791

* forward. If there are no NULL targets, then we know because

792

* entry->ptr will not be NULL.

793

794

old_entry = old_index->hash[hash_offset + 1];

795

old_entry--;

796

while (old_entry->ptr == NULL

797

&& old_entry >= old_index->hash[hash_offset]) {

798

old_entry--;

799

}

800

old_entry++;

801

if (old_entry->ptr != NULL

802

|| old_entry >= old_index->hash[hash_offset + 1]) {

803

/* There is no room for this entry, we have to resize */

804

// char buff[128];

805

// get_text(buff, entry->ptr);

806

// fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",

807

// hash_offset, entry->val, buff);

808

// for (old_entry = old_index->hash[hash_offset];

809

// old_entry < old_index->hash[hash_offset+1];

810

// ++old_entry) {

811

// get_text(buff, old_entry->ptr);

812

// fprintf(stderr, " [%2d] %8x %8x: '%s'\n",

813

// (int)(old_entry - old_index->hash[hash_offset]),

814

// old_entry->val, old_entry->ptr, buff);

815

// }

816

break;

817

}

818

num_inserted++;

819

*old_entry = *entry;

820

/* For entries which we *do* manage to insert into old_index, we don't

821

* want them double copied into the final output.

822

823

old_index->num_entries++;

824

}

825

if (num_entries > 0) {

826

/* We couldn't fit the new entries into the old index, so allocate a

827

* new one, and fill it with stuff.

828

829

// fprintf(stderr, "inserted %d before resize\n", num_inserted);

830

new_index = create_index_from_old_and_new_entries(old_index,

831

entry, num_entries);

832

} else {

833

new_index = NULL;

834

// fprintf(stderr, "inserted %d without resizing\n", num_inserted);

835

}

836

free(entries);

837

return new_index;

838

}

839

840

void free_delta_index(struct delta_index *index)

841

{

842

free(index);

843

}

844

845

unsigned long sizeof_delta_index(struct delta_index *index)

846

{

847

if (index)

848

return index->memsize;

849

else

850

return 0;

851

}

852

853

854

* The maximum size for any opcode sequence, including the initial header

855

* plus Rabin window plus biggest copy.

856

857

#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)

858

859

void *

860

create_delta(const struct delta_index *index,

861

const void *trg_buf, unsigned long trg_size,

862

unsigned long *delta_size, unsigned long max_size)

863

{

864

unsigned int i, outpos, outsize, moff, msize, val;

865

const struct source_info *msource;

866

int inscnt;

867

const unsigned char *ref_data, *ref_top, *data, *top;

868

unsigned char *out;

869

unsigned long source_size;

870

871

if (!trg_buf || !trg_size)

872

return NULL;

873

if (index == NULL)

874

return NULL;

875

876

outpos = 0;

877

outsize = 8192;

878

if (max_size && outsize >= max_size)

879

outsize = max_size + MAX_OP_SIZE + 1;

880

out = malloc(outsize);

881

if (!out)

882

return NULL;

883

884

/* store reference buffer size */

885

source_size = index->last_src->size + index->last_src->agg_offset;

886

i = source_size;

887

while (i >= 0x80) {

888

out[outpos++] = i | 0x80;

889

i >>= 7;

890

}

891

out[outpos++] = i;

892

893

/* store target buffer size */

894

i = trg_size;

895

while (i >= 0x80) {

896

out[outpos++] = i | 0x80;

897

i >>= 7;

898

}

899

out[outpos++] = i;

900

901

data = trg_buf;

902

top = (const unsigned char *) trg_buf + trg_size;

903

904

/* Start the matching by filling out with a simple 'insert' instruction, of

905

* the first RABIN_WINDOW bytes of the input.

906

907

outpos++; /* leave a byte for the insert command */

908

val = 0;

909

for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {

910

out[outpos++] = *data;

911

val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];

912

}

913

/* we are now setup with an insert of 'i' bytes and val contains the RABIN

914

* hash for those bytes, and data points to the RABIN_WINDOW+1 byte of

915

* input.

916

917

inscnt = i;

918

919

moff = 0;

920

msize = 0;

921

msource = NULL;

922

while (data < top) {

923

if (msize < 4096) {

924

/* we don't have a 'worthy enough' match yet, so let's look for

925

* one.

926

927

struct index_entry *entry;

928

/* Shift the window by one byte. */

929

val ^= U[data[-RABIN_WINDOW]];

930

val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];

931

i = val & index->hash_mask;

932

/* TODO: When using multiple indexes like this, the hash tables

933

* mapping val => index_entry become less efficient.

934

* You end up getting a lot more collisions in the hash,

935

* which doesn't actually lead to a entry->val match.

936

937

for (entry = index->hash[i];

938

entry < index->hash[i+1] && entry->src != NULL;

939

entry++) {

940

const unsigned char *ref;

941

const unsigned char *src;

942

unsigned int ref_size;

943

if (entry->val != val)

944

continue;

945

ref = entry->ptr;

946

src = data;

947

ref_data = entry->src->buf;

948

ref_top = ref_data + entry->src->size;

949

ref_size = ref_top - ref;

950

/* ref_size is the longest possible match that we could make

951

* here. If ref_size <= msize, then we know that we cannot

952

* match more bytes with this location that we have already

953

* matched.

954

955

if (ref_size > top - src)

956

ref_size = top - src;

957

if (ref_size <= msize)

958

break;

959

/* See how many bytes actually match at this location. */

960

while (ref_size-- && *src++ == *ref)

961

ref++;

962

if (msize < ref - entry->ptr) {

963

/* this is our best match so far */

964

msize = ref - entry->ptr;

965

msource = entry->src;

966

moff = entry->ptr - ref_data;

967

if (msize >= 4096) /* good enough */

968

break;

969

}

970

}

971

}

972

973

if (msize < 4) {

974

/* The best match right now is less than 4 bytes long. So just add

975

* the current byte to the insert instruction. Increment the insert

976

* counter, and copy the byte of data into the output buffer.

977

978

if (!inscnt)

979

outpos++;

980

out[outpos++] = *data++;

981

inscnt++;

982

if (inscnt == 0x7f) {

983

/* We have a max length insert instruction, finalize it in the

984

* output.

985

986

out[outpos - inscnt - 1] = inscnt;

987

inscnt = 0;

988

}

989

msize = 0;

990

} else {

991

unsigned int left;

992

unsigned char *op;

993

994

if (inscnt) {

995

ref_data = msource->buf;

996

while (moff && ref_data[moff-1] == data[-1]) {

997

/* we can match one byte back */

998

msize++;

999

moff--;

1000

data--;

1001

outpos--;

1002

if (--inscnt)

1003

continue;

1004

outpos--; /* remove count slot */

1005

inscnt--; /* make it -1 */

1006

break;

1007

}

1008

out[outpos - inscnt - 1] = inscnt;

1009

inscnt = 0;

1010

}

1011

1012

/* A copy op is currently limited to 64KB (pack v2) */

1013

left = (msize < 0x10000) ? 0 : (msize - 0x10000);

1014

msize -= left;

1015

1016

op = out + outpos++;

1017

i = 0x80;

1018

1019

/* moff is the offset in the local structure, for encoding, we need

1020

* to push it into the global offset

1021

1022

assert(moff < msource->size);

1023

moff += msource->agg_offset;

1024

assert(moff + msize <= source_size);

1025

if (moff & 0x000000ff)

1026

out[outpos++] = moff >> 0, i |= 0x01;

1027

if (moff & 0x0000ff00)

1028

out[outpos++] = moff >> 8, i |= 0x02;

1029

if (moff & 0x00ff0000)

1030

out[outpos++] = moff >> 16, i |= 0x04;

1031

if (moff & 0xff000000)

1032

out[outpos++] = moff >> 24, i |= 0x08;

1033

/* Put it back into local coordinates, in case we have multiple

1034

* copies in a row.

1035

1036

moff -= msource->agg_offset;

1037

1038

if (msize & 0x00ff)

1039

out[outpos++] = msize >> 0, i |= 0x10;

1040

if (msize & 0xff00)

1041

out[outpos++] = msize >> 8, i |= 0x20;

1042

1043

*op = i;

1044

1045

data += msize;

1046

moff += msize;

1047

msize = left;

1048

1049

if (msize < 4096) {

1050

int j;

1051

val = 0;

1052

for (j = -RABIN_WINDOW; j < 0; j++)

1053

val = ((val << 8) | data[j])

1054

^ T[val >> RABIN_SHIFT];

1055

}

1056

}

1057

1058

if (outpos >= outsize - MAX_OP_SIZE) {

1059

void *tmp = out;

1060

outsize = outsize * 3 / 2;

1061

if (max_size && outsize >= max_size)

1062

outsize = max_size + MAX_OP_SIZE + 1;

1063

if (max_size && outpos > max_size)

1064

break;

1065

out = realloc(out, outsize);

1066

if (!out) {

1067

free(tmp);

1068

return NULL;

1069

}

1070

}

1071

}

1072

1073

if (inscnt)

1074

out[outpos - inscnt - 1] = inscnt;

1075

1076

if (max_size && outpos > max_size) {

1077

free(out);

1078

return NULL;

1079

}

1080

1081

*delta_size = outpos;

1082

return out;

1083

}

1084

1085

/* vim: et ts=4 sw=4 sts=4

1086

Older »