~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/diff-delta.c

Committer: John Arbash Meinel
Date: 2009-03-20 03:21:07 UTC
mfrom: (3735.33.18 gc_delta_index_room)
mto: (3735.36.1 brisbane-core) (4194.1.1 bbc) (4216.5.1 bbc) (4265.1.1 bbc-merge)
mto: This revision was merged to the branch mainline in revision 4280.
Revision ID: john@arbash-meinel.com-20090320032107-bm9wg421rtcacy5i

Merge the updates to the groupcompress DeltaIndex.

files modified:
bzrlib/delta.h

bzrlib/diff-delta.c

bzrlib/tests/test__groupcompress_pyx.py

Show diffs side-by-side

added added

removed removed

bzrlib/diff-delta.c

* published by the Free Software Foundation.

#include <stdio.h>

#include "delta.h"

#include <stdlib.h>

#include <string.h>

#define RABIN_SHIFT 23

#define RABIN_WINDOW 16

/* The hash map is sized to put 4 entries per bucket, this gives us ~even room

* for more data. Tweaking this number above 4 doesn't seem to help much,

* anyway.

#define EXTRA_NULLS 4

static const unsigned int T[256] = {

0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,

0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,

121

129

unsigned int val;

122

130

};

123

131

132

struct index_entry_linked_list {

133

struct index_entry *p_entry;

134

struct index_entry_linked_list *next;

135

};

136

124

137

struct unpacked_index_entry {

125

138

struct index_entry entry;

126

139

struct unpacked_index_entry *next;

197

210

198

211

static struct delta_index *

199

212

pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,

200

unsigned int num_entries)

213

unsigned int num_entries, struct delta_index *old_index)

201

214

{

202

unsigned int i, memsize;

215

unsigned int i, j, hmask, memsize, fit_in_old, copied_count;

203

216

struct unpacked_index_entry *entry;

204

217

struct delta_index *index;

205

struct index_entry *packed_entry, **packed_hash;

218

struct index_entry *packed_entry, **packed_hash, *old_entry, *copy_from;

219

struct index_entry null_entry = {0};

206

220

void *mem;

221

222

hmask = hsize - 1;

223

224

// if (old_index) {

225

// fprintf(stderr, "Packing %d entries into %d for total of %d entries"

226

// " %x => %x\n",

227

// num_entries - old_index->num_entries,

228

// old_index->num_entries, num_entries,

229

// old_index->hash_mask, hmask);

230

// } else {

231

// fprintf(stderr, "Packing %d entries into a new index\n",

232

// num_entries);

233

// }

234

/* First, see if we can squeeze the new items into the existing structure.

235

236

fit_in_old = 0;

237

copied_count = 0;

238

if (old_index && old_index->hash_mask == hmask) {

239

fit_in_old = 1;

240

for (i = 0; i < hsize; ++i) {

241

packed_entry = NULL;

242

for (entry = hash[i]; entry; entry = entry->next) {

243

if (packed_entry == NULL) {

244

/* Find the last open spot */

245

packed_entry = old_index->hash[i + 1];

246

--packed_entry;

247

while (packed_entry >= old_index->hash[i]

248

&& packed_entry->ptr == NULL) {

249

--packed_entry;

250

}

251

++packed_entry;

252

}

253

if (packed_entry >= old_index->hash[i+1]

254

|| packed_entry->ptr != NULL) {

255

/* There are no free spots here :( */

256

fit_in_old = 0;

257

break;

258

}

259

/* We found an empty spot to put this entry

260

* Copy it over, and remove it from the linked list, just in

261

* case we end up running out of room later.

262

263

*packed_entry++ = entry->entry;

264

assert(entry == hash[i]);

265

hash[i] = entry->next;

266

copied_count += 1;

267

old_index->num_entries++;

268

}

269

if (!fit_in_old) {

270

break;

271

}

272

}

273

}

274

if (old_index) {

275

if (fit_in_old) {

276

// fprintf(stderr, "Fit all %d entries into old index\n",

277

// copied_count);

278

/* No need to allocate a new buffer */

279

return NULL;

280

} else {

281

// fprintf(stderr, "Fit only %d entries into old index,"

282

// " reallocating\n", copied_count);

283

}

284

}

207

285

208

286

* Now create the packed index in array form

209

287

* rather than linked lists.

288

* Leave a 2-entry gap for inserting more entries between the groups

210

289

211

290

memsize = sizeof(*index)

212

291

+ sizeof(*packed_hash) * (hsize+1)

213

+ sizeof(*packed_entry) * num_entries;

292

+ sizeof(*packed_entry) * (num_entries + hsize * EXTRA_NULLS);

214

293

mem = malloc(memsize);

215

294

if (!mem) {

216

295

return NULL;

218

297

219

298

index = mem;

220

299

index->memsize = memsize;

221

index->hash_mask = hsize - 1;

300

index->hash_mask = hmask;

222

301

index->num_entries = num_entries;

223

224

mem = index->hash;

225

packed_hash = mem;

226

mem = packed_hash + (hsize+1);

227

packed_entry = mem;

228

229

for (i = 0; i < hsize; i++) {

230

231

* Coalesce all entries belonging to one linked list

232

* into consecutive array entries.

233

234

packed_hash[i] = packed_entry;

235

for (entry = hash[i]; entry; entry = entry->next)

236

*packed_entry++ = entry->entry;

237

}

238

239

/* Sentinel value to indicate the length of the last hash bucket */

240

packed_hash[hsize] = packed_entry;

241

242

assert(packed_entry - (struct index_entry *)mem == num_entries);

243

index->last_entry = (packed_entry - 1);

244

return index;

245

}

246

247

void include_entries_from_index(struct unpacked_index_entry **hash,

248

unsigned int *hash_count,

249

unsigned int hsize,

250

struct unpacked_index_entry *entry,

251

const struct delta_index *old)

252

{

253

unsigned int i, hmask, masked_val;

254

struct index_entry *old_entry;

255

256

hmask = hsize - 1;

257

/* Iterate backwards through the existing entries

258

* This is so that matches early in the file end up being the first pointer

259

* in the linked list.

260

* TODO: We know that all old entries are going to occur before the new

261

* entries, and that we are going to follow this with a limit and

262

* then pack step. There is probably a way we could optimize this

263

* step, so that we wouldn't have to copy everything into a new

264

* buffer, and then copy everything again into yet another buffer.

265

266

for (old_entry = old->last_entry, i = 0; i < old->num_entries;

267

i++, old_entry--) {

268

masked_val = old_entry->val & hmask;

269

entry->entry = *old_entry;

270

entry->next = hash[masked_val];

271

hash[masked_val] = entry++;

272

hash_count[masked_val]++;

273

}

274

}

275

276

struct delta_index *

277

create_index_from_old_and_hash(const struct delta_index *old,

278

struct unpacked_index_entry **hash,

279

unsigned int hsize,

280

unsigned int num_entries)

281

{

282

unsigned int i, memsize, total_num_entries;

283

struct unpacked_index_entry *entry;

284

struct delta_index *index;

285

struct index_entry *packed_entry, **packed_hash;

286

struct index_entry *copy_from_start, *copy_to_start;

287

size_t total_copy, to_copy;

288

unsigned int num_ops;

289

unsigned int bytes_copied;

290

void *mem;

291

292

* Now create the packed index in array form

293

* rather than linked lists.

294

295

total_num_entries = num_entries + old->num_entries;

296

memsize = sizeof(*index)

297

+ sizeof(*packed_hash) * (hsize+1)

298

+ sizeof(*packed_entry) * total_num_entries;

299

mem = malloc(memsize);

300

if (!mem) {

301

return NULL;

302

}

303

304

index = mem;

305

index->memsize = memsize;

306

index->hash_mask = hsize - 1;

307

index->num_entries = total_num_entries;

308

309

mem = index->hash;

310

packed_hash = mem;

311

mem = packed_hash + (hsize+1);

312

packed_entry = mem;

313

314

total_copy = 0;

315

bytes_copied = 0;

316

num_ops = 0;

317

copy_from_start = copy_to_start = NULL;

318

for (i = 0; i < hsize; i++) {

319

320

* Coalesce all entries belonging to one linked list

321

* into consecutive array entries.

322

* The entries in old all come before the entries in hash.

323

324

packed_hash[i] = packed_entry;

325

to_copy = (old->hash[i+1] - old->hash[i]);

326

if (to_copy > 0) {

327

/* This next range can be copied wholesale. However, we will wait

328

* until we have to insert something from the new data before we

329

* copy everything.

330

* So for now, just reserve space for the copy.

302

if (old_index) {

303

if (hmask < old_index->hash_mask) {

304

fprintf(stderr, "hash mask was shrunk %x => %x\n",

305

old_index->hash_mask, hmask);

306

}

307

assert(hmask >= old_index->hash_mask);

308

}

309

310

mem = index->hash;

311

packed_hash = mem;

312

mem = packed_hash + (hsize+1);

313

packed_entry = mem;

314

315

for (i = 0; i < hsize; i++) {

316

317

* Coalesce all entries belonging to one linked list

318

* into consecutive array entries.

319

320

packed_hash[i] = packed_entry;

321

/* Old comes earlier as a source, so it always comes first in a given

322

* hash bucket.

323

324

if (old_index) {

325

/* Could we optimize this to use memcpy when hmask ==

326

* old_index->hash_mask? Would it make any real difference?

331

327

332

if (total_copy == 0) {

333

copy_from_start = old->hash[i];

334

copy_to_start = packed_entry;

328

j = i & old_index->hash_mask;

329

copy_from = old_index->hash[j];

330

for (old_entry = old_index->hash[j];

331

old_entry < old_index->hash[j + 1] && old_entry->ptr != NULL;

332

old_entry++) {

333

if ((old_entry->val & hmask) == i) {

334

*packed_entry++ = *old_entry;

335

}

335

336

}

336

packed_entry += to_copy;

337

total_copy += to_copy;

338

assert((packed_entry - copy_to_start) == total_copy);

339

assert((old->hash[i+1] - copy_from_start) == total_copy);

340

337

}

341

338

for (entry = hash[i]; entry; entry = entry->next) {

342

/* We have an entry to insert, so flush the copy buffer */

343

if (total_copy > 0) {

344

assert(copy_to_start != NULL);

345

assert(copy_from_start != NULL);

346

memcpy(copy_to_start, copy_from_start,

347

total_copy * sizeof(struct index_entry));

348

bytes_copied += (total_copy * sizeof(struct index_entry));

349

total_copy = 0;

350

copy_from_start = copy_to_start = NULL;

351

num_ops += 1;

352

}

353

339

*packed_entry++ = entry->entry;

354

340

}

355

}

356

if (total_copy > 0) {

357

assert(copy_to_start != NULL);

358

assert(copy_from_start != NULL);

359

memcpy(copy_to_start, copy_from_start,

360

total_copy * sizeof(struct index_entry));

361

bytes_copied += (total_copy * sizeof(struct index_entry));

362

num_ops += 1;

341

/* TODO: At this point packed_entry - packed_hash[i] is the number of

342

* records that we have inserted into this hash bucket.

343

* We should *really* consider doing some limiting along the

344

* lines of limit_hash_buckets() to avoid pathological behavior.

345

346

/* Now add extra 'NULL' entries that we can use for future expansion. */

347

for (j = 0; j < EXTRA_NULLS; ++j ) {

348

*packed_entry++ = null_entry;

349

}

363

350

}

364

351

365

352

/* Sentinel value to indicate the length of the last hash bucket */

366

353

packed_hash[hsize] = packed_entry;

367

354

368

assert(packed_entry - (struct index_entry *)mem == total_num_entries);

355

if (packed_entry - (struct index_entry *)mem

356

!= num_entries + hsize*EXTRA_NULLS) {

357

fprintf(stderr, "We expected %d entries, but created %d\n",

358

num_entries + hsize*EXTRA_NULLS,

359

(int)(packed_entry - (struct index_entry*)mem));

360

}

361

assert(packed_entry - (struct index_entry *)mem

362

== num_entries + hsize*EXTRA_NULLS);

369

363

index->last_entry = (packed_entry - 1);

370

364

return index;

371

365

}

372

366

373

367

374

struct delta_index * create_delta_index(const struct source_info *src,

375

const struct delta_index *old)

368

struct delta_index *

369

create_delta_index(const struct source_info *src,

370

struct delta_index *old)

376

371

{

377

372

unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;

378

373

unsigned int total_num_entries;

398

393

for (i = 4; (1u << i) < hsize && i < 31; i++);

399

394

hsize = 1 << i;

400

395

hmask = hsize - 1;

396

if (old && old->hash_mask > hmask) {

397

hmask = old->hash_mask;

398

hsize = hmask + 1;

399

}

401

400

402

401

/* allocate lookup index */

403

402

memsize = sizeof(*hash) * hsize +

442

441

hash_count[i]++;

443

442

}

444

443

}

445

/* If we have an existing delta_index, we want to bring its info into the

446

* new structure. We assume that the existing structure's objects all occur

447

* before the new source, so they get first precedence in the index.

448

449

if (old != NULL)

450

include_entries_from_index(hash, hash_count, hsize, entry, old);

451

444

/* TODO: It would be nice to limit_hash_buckets at a better time. */

452

445

total_num_entries = limit_hash_buckets(hash, hash_count, hsize,

453

446

total_num_entries);

454

447

free(hash_count);

455

index = pack_delta_index(hash, hsize, total_num_entries);

456

index->last_src = src;

448

if (old) {

449

old->last_src = src;

450

}

451

index = pack_delta_index(hash, hsize, total_num_entries, old);

457

452

free(hash);

458

453

if (!index) {

459

454

return NULL;

460

455

}

461

return index;

456

index->last_src = src;

457

return index;

458

}

459

460

/* Take some entries, and put them into a custom hash.

461

* @param entries A list of entries, sorted by position in file

462

* @param num_entries Length of entries

463

* @param out_hsize The maximum size of the hash, the final size will be

464

* returned here

465

466

struct index_entry_linked_list **

467

_put_entries_into_hash(struct index_entry *entries, unsigned int num_entries,

468

unsigned int hsize)

469

{

470

unsigned int hash_offset, hmask, memsize;

471

struct index_entry *entry, *last_entry;

472

struct index_entry_linked_list *out_entry, **hash;

473

void *mem;

474

475

hmask = hsize - 1;

476

477

memsize = sizeof(*hash) * hsize +

478

sizeof(*out_entry) * num_entries;

479

mem = malloc(memsize);

480

if (!mem)

481

return NULL;

482

hash = mem;

483

mem = hash + hsize;

484

out_entry = mem;

485

486

memset(hash, 0, sizeof(*hash)*(hsize+1));

487

488

/* We know that entries are in the order we want in the output, but they

489

* aren't "grouped" by hash bucket yet.

490

491

last_entry = entries + num_entries;

492

for (entry = entries + num_entries - 1; entry >= entries; --entry) {

493

hash_offset = entry->val & hmask;

494

out_entry->p_entry = entry;

495

out_entry->next = hash[hash_offset];

496

/* TODO: Remove entries that have identical vals, or at least filter

497

* the map a little bit.

498

* if (hash[i] != NULL) {

499

* }

500

501

hash[hash_offset] = out_entry;

502

++out_entry;

503

}

504

return hash;

505

}

506

507

508

struct delta_index *

509

create_index_from_old_and_new_entries(const struct delta_index *old_index,

510

struct index_entry *entries,

511

unsigned int num_entries)

512

{

513

unsigned int i, j, hsize, hmask, total_num_entries;

514

struct delta_index *index;

515

struct index_entry *entry, *packed_entry, **packed_hash;

516

struct index_entry *last_entry, null_entry = {0};

517

void *mem;

518

unsigned long memsize;

519

struct index_entry_linked_list *unpacked_entry, **mini_hash;

520

521

/* Determine index hash size. Note that indexing skips the

522

first byte to allow for optimizing the Rabin's polynomial

523

initialization in create_delta(). */

524

total_num_entries = num_entries + old_index->num_entries;

525

hsize = total_num_entries / 4;

526

for (i = 4; (1u << i) < hsize && i < 31; i++);

527

hsize = 1 << i;

528

if (hsize < old_index->hash_mask) {

529

/* For some reason, there was a code path that would actually *shrink*

530

* the hash size. This screws with some later code, and in general, I

531

* think it better to make the hash bigger, rather than smaller. So

532

* we'll just force the size here.

533

* Possibly done by create_delta_index running into a

534

* limit_hash_buckets call, that ended up transitioning across a

535

* power-of-2. The cause isn't 100% clear, though.

536

537

hsize = old_index->hash_mask + 1;

538

}

539

hmask = hsize - 1;

540

// fprintf(stderr, "resizing index to insert %d entries into array"

541

// " with %d entries: %x => %x\n",

542

// num_entries, old_index->num_entries, old_index->hash_mask, hmask);

543

544

memsize = sizeof(*index)

545

+ sizeof(*packed_hash) * (hsize+1)

546

+ sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);

547

mem = malloc(memsize);

548

if (!mem) {

549

return NULL;

550

}

551

index = mem;

552

index->memsize = memsize;

553

index->hash_mask = hmask;

554

index->num_entries = total_num_entries;

555

index->last_src = old_index->last_src;

556

557

mem = index->hash;

558

packed_hash = mem;

559

mem = packed_hash + (hsize+1);

560

packed_entry = mem;

561

562

mini_hash = _put_entries_into_hash(entries, num_entries, hsize);

563

if (mini_hash == NULL) {

564

free(index);

565

return NULL;

566

}

567

last_entry = entries + num_entries;

568

for (i = 0; i < hsize; i++) {

569

570

* Coalesce all entries belonging in one hash bucket

571

* into consecutive array entries.

572

* The entries in old_index all come before 'entries'.

573

574

packed_hash[i] = packed_entry;

575

/* Copy any of the old entries across */

576

/* Would we rather use memcpy? */

577

if (hmask == old_index->hash_mask) {

578

for (entry = old_index->hash[i];

579

entry < old_index->hash[i+1] && entry->ptr != NULL;

580

++entry) {

581

assert((entry->val & hmask) == i);

582

*packed_entry++ = *entry;

583

}

584

} else {

585

/* If we resized the index from this action, all of the old values

586

* will be found in the previous location, but they will end up

587

* spread across the new locations.

588

589

j = i & old_index->hash_mask;

590

for (entry = old_index->hash[j];

591

entry < old_index->hash[j+1] && entry->ptr != NULL;

592

++entry) {

593

assert((entry->val & old_index->hash_mask) == j);

594

if ((entry->val & hmask) == i) {

595

/* Any entries not picked up here will be picked up on the

596

* next pass.

597

598

*packed_entry++ = *entry;

599

}

600

}

601

}

602

/* Now see if we need to insert any of the new entries.

603

* Note that loop ends up O(hsize*num_entries), so we expect that

604

* num_entries is always small.

605

* We also help a little bit by collapsing the entry range when the

606

* endpoints are inserted. However, an alternative would be to build a

607

* quick hash lookup for just the new entries.

608

* Testing shows that this list can easily get up to about 100

609

* entries, the tradeoff is a malloc, 1 pass over the entries, copying

610

* them into a sorted buffer, and a free() when done,

611

612

for (unpacked_entry = mini_hash[i];

613

unpacked_entry;

614

unpacked_entry = unpacked_entry->next) {

615

assert((unpacked_entry->p_entry->val & hmask) == i);

616

*packed_entry++ = *(unpacked_entry->p_entry);

617

}

618

/* Now insert some extra nulls */

619

for (j = 0; j < EXTRA_NULLS; ++j) {

620

*packed_entry++ = null_entry;

621

}

622

}

623

free(mini_hash);

624

625

/* Sentinel value to indicate the length of the last hash bucket */

626

packed_hash[hsize] = packed_entry;

627

628

if ((packed_entry - (struct index_entry *)mem)

629

!= (total_num_entries + hsize*EXTRA_NULLS)) {

630

fprintf(stderr, "We expected %d entries, but created %d\n",

631

total_num_entries + hsize*EXTRA_NULLS,

632

(int)(packed_entry - (struct index_entry*)mem));

633

fflush(stderr);

634

}

635

assert((packed_entry - (struct index_entry *)mem)

636

== (total_num_entries + hsize * EXTRA_NULLS));

637

index->last_entry = (packed_entry - 1);

638

return index;

639

}

640

641

642

void

643

get_text(char buff[128], const unsigned char *ptr)

644

{

645

unsigned int i;

646

const unsigned char *start;

647

unsigned char cmd;

648

start = (ptr-RABIN_WINDOW-1);

649

cmd = *(start);

650

if (cmd < 0x80) {// This is likely to be an insert instruction

651

if (cmd < RABIN_WINDOW) {

652

cmd = RABIN_WINDOW;

653

}

654

} else {

655

/* This was either a copy [should never be] or it

656

* was a longer insert so the insert start happened at 16 more

657

* bytes back.

658

659

cmd = RABIN_WINDOW + 1;

660

}

661

if (cmd > 60) {

662

cmd = 60; /* Be friendly to 80char terms */

663

}

664

/* Copy the 1 byte command, and 4 bytes after the insert */

665

cmd += 5;

666

memcpy(buff, start, cmd);

667

buff[cmd] = 0;

668

for (i = 0; i < cmd; ++i) {

669

if (buff[i] == '\n') {

670

buff[i] = 'N';

671

} else if (buff[i] == '\t') {

672

buff[i] = 'T';

673

}

674

}

462

675

}

463

676

464

677

struct delta_index *

465

678

create_delta_index_from_delta(const struct source_info *src,

466

const struct delta_index *old)

679

struct delta_index *old_index)

467

680

{

468

unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;

469

unsigned int total_num_entries;

681

unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;

682

unsigned int hash_offset;

470

683

const unsigned char *data, *buffer, *top;

471

684

unsigned char cmd;

472

struct delta_index *index;

473

struct unpacked_index_entry *entry, **hash;

474

void *mem;

475

unsigned long memsize;

685

struct delta_index *new_index;

686

struct index_entry *entry, *entries, *old_entry;

476

687

477

688

if (!src->buf || !src->size)

478

689

return NULL;

486

697

actual number will be recomputed during processing.

487

698

488

699

489

num_entries = (src->size - 1) / RABIN_WINDOW;

490

if (old != NULL)

491

total_num_entries = num_entries + old->num_entries;

492

else

493

total_num_entries = num_entries;

494

hsize = total_num_entries / 4;

495

for (i = 4; (1u << i) < hsize && i < 31; i++);

496

hsize = 1 << i;

497

hmask = hsize - 1;

498

499

/* allocate lookup index */

500

memsize = sizeof(*hash) * hsize +

501

sizeof(*entry) * total_num_entries;

502

mem = malloc(memsize);

503

if (!mem)

504

return NULL;

505

hash = mem;

506

mem = hash + hsize;

507

entry = mem;

508

509

memset(hash, 0, hsize * sizeof(*hash));

510

511

/* allocate an array to count hash num_entries */

512

hash_count = calloc(hsize, sizeof(*hash_count));

513

if (!hash_count) {

514

free(hash);

515

return NULL;

516

}

700

max_num_entries = (src->size - 1) / RABIN_WINDOW;

701

702

/* allocate an array to hold whatever entries we find */

703

entries = malloc(sizeof(*entry) * max_num_entries);

704

if (!entries) /* malloc failure */

705

return NULL;

517

706

518

707

/* then populate the index for the new data */

519

/* The create_delta_index code starts at the end and works backward,

520

* because it is easier to update the entry pointers (you always update the

521

* head, rather than updating the tail).

522

* However, we can't walk the delta structure that way.

523

524

708

prev_val = ~0;

525

709

data = buffer;

526

710

/* source size */

527

711

get_delta_hdr_size(&data, top);

528

712

/* target size */

529

713

get_delta_hdr_size(&data, top);

714

entry = entries; /* start at the first slot */

530

715

num_entries = 0; /* calculate the real number of entries */

531

716

while (data < top) {

532

717

cmd = *data++;

545

730

/* Invalid insert, not enough bytes in the delta */

546

731

break;

547

732

}

548

for (; cmd > RABIN_WINDOW; cmd -= RABIN_WINDOW,

733

/* The create_delta code requires a match at least 4 characters

734

* (including only the last char of the RABIN_WINDOW) before it

735

* will consider it something worth copying rather than inserting.

736

* So we don't want to index anything that we know won't ever be a

737

* match.

738

739

for (; cmd > RABIN_WINDOW + 3; cmd -= RABIN_WINDOW,

549

740

data += RABIN_WINDOW) {

550

741

unsigned int val = 0;

551

742

for (i = 1; i <= RABIN_WINDOW; i++)

553

744

if (val != prev_val) {

554

745

/* Only keep the first of consecutive data */

555

746

prev_val = val;

556

i = val & hmask;

557

entry->entry.ptr = data + RABIN_WINDOW;

558

entry->entry.val = val;

559

entry->entry.src = src;

560

entry->next = NULL;

561

/* Now we have to insert this entry at the end of the hash

562

* map

563

564

if (hash[i] == NULL) {

565

hash[i] = entry;

566

} else {

567

struct unpacked_index_entry *this;

568

for (this = hash[i];

569

this->next != NULL;

570

this = this->next) { /* No action */ }

571

this->next = entry;

572

this = entry;

573

}

574

hash_count[i]++;

747

num_entries++;

748

entry->ptr = data + RABIN_WINDOW;

749

entry->val = val;

750

entry->src = src;

575

751

entry++;

576

num_entries++;

752

if (num_entries > max_num_entries) {

753

/* We ran out of entry room, something is really wrong

754

755

break;

756

}

577

757

}

578

758

}

579

759

/* Move the data pointer by whatever remainder is left */

589

769

}

590

770

if (data != top) {

591

771

/* Something was wrong with this delta */

592

free(hash);

593

free(hash_count);

772

free(entries);

594

773

return NULL;

595

774

}

596

775

if (num_entries == 0) {

597

776

/** Nothing to index **/

598

free(hash);

599

free(hash_count);

777

free(entries);

600

778

return NULL;

601

779

}

602

if (old != NULL) {

603

if (hmask == old->hash_mask) {

604

/* The total hash table size didn't change, which means that

605

* entries will end up in the same pages. We can do bulk-copying to

606

* get the final output

607

608

index = create_index_from_old_and_hash(old, hash, hsize,

609

num_entries);

610

free(hash_count);

611

free(hash);

612

if (!index) {

613

return NULL;

614

}

615

index->last_src = src;

616

return index;

617

} else {

618

total_num_entries = num_entries + old->num_entries;

619

include_entries_from_index(hash, hash_count, hsize, entry, old);

620

}

780

assert(old_index != NULL);

781

old_index->last_src = src;

782

/* See if we can fill in these values into the holes in the array */

783

entry = entries;

784

num_inserted = 0;

785

for (; num_entries > 0; --num_entries, ++entry) {

786

hash_offset = (entry->val & old_index->hash_mask);

787

/* The basic structure is a hash => packed_entries that fit in that

788

* hash bucket. Things are structured such that the hash-pointers are

789

* strictly ordered. So we start by pointing to the next pointer, and

790

* walk back until we stop getting NULL targets, and then go back

791

* forward. If there are no NULL targets, then we know because

792

* entry->ptr will not be NULL.

793

794

old_entry = old_index->hash[hash_offset + 1];

795

old_entry--;

796

while (old_entry->ptr == NULL

797

&& old_entry >= old_index->hash[hash_offset]) {

798

old_entry--;

799

}

800

old_entry++;

801

if (old_entry->ptr != NULL

802

|| old_entry >= old_index->hash[hash_offset + 1]) {

803

/* There is no room for this entry, we have to resize */

804

// char buff[128];

805

// get_text(buff, entry->ptr);

806

// fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",

807

// hash_offset, entry->val, buff);

808

// for (old_entry = old_index->hash[hash_offset];

809

// old_entry < old_index->hash[hash_offset+1];

810

// ++old_entry) {

811

// get_text(buff, old_entry->ptr);

812

// fprintf(stderr, " [%2d] %8x %8x: '%s'\n",

813

// (int)(old_entry - old_index->hash[hash_offset]),

814

// old_entry->val, old_entry->ptr, buff);

815

// }

816

break;

817

}

818

num_inserted++;

819

*old_entry = *entry;

820

/* For entries which we *do* manage to insert into old_index, we don't

821

* want them double copied into the final output.

822

823

old_index->num_entries++;

824

}

825

if (num_entries > 0) {

826

/* We couldn't fit the new entries into the old index, so allocate a

827

* new one, and fill it with stuff.

828

829

// fprintf(stderr, "inserted %d before resize\n", num_inserted);

830

new_index = create_index_from_old_and_new_entries(old_index,

831

entry, num_entries);

621

832

} else {

622

total_num_entries = num_entries;

623

}

624

625

total_num_entries = limit_hash_buckets(hash, hash_count, hsize,

626

total_num_entries);

627

free(hash_count);

628

index = pack_delta_index(hash, hsize, total_num_entries);

629

free(hash);

630

if (!index) {

631

return NULL;

632

}

633

index->last_src = src;

634

return index;

833

new_index = NULL;

834

// fprintf(stderr, "inserted %d without resizing\n", num_inserted);

835

}

836

free(entries);

837

return new_index;

635

838

}

636

839

637

840

void free_delta_index(struct delta_index *index)

731

934

* You end up getting a lot more collisions in the hash,

732

935

* which doesn't actually lead to a entry->val match.

733

936

734

for (entry = index->hash[i]; entry < index->hash[i+1];

937

for (entry = index->hash[i];

938

entry < index->hash[i+1] && entry->src != NULL;

735

939

entry++) {

736

940

const unsigned char *ref;

737

941

const unsigned char *src;

Older »