~bzr-pqm/bzr/bzr.dev

Committer: John Arbash Meinel
Date: 2009-03-18 21:50:56 UTC
mto: (3735.2.157 brisbane-core)
mto: This revision was merged to the branch mainline in revision 4280.
Revision ID: john@arbash-meinel.com-20090318215056-dzx4j8ym5yhwh67b

The new layout is working.

Commenting out the debug info for now.
What I'm finding is a surprising number of repeated strings.
Basically, common strings of length < 20, which then end up
indexed by the RABIN code, but don't get copied in the output.
(because RABIN is a 16-byte match, but the copy command has
a minimum size of 20-bytes. Perhaps we need to change the
code so that it doesn't try to index <20 character inserts.
Or change the copy code so that it allows shorter copies.

files modified:
bzrlib/delta.h

bzrlib/diff-delta.c

bzrlib/tests/test__groupcompress_pyx.py

Show diffs side-by-side

added added

removed removed

bzrlib/diff-delta.c

* published by the Free Software Foundation.

#include <stdio.h>

#include "delta.h"

#include <stdlib.h>

#include <string.h>

248

250

/* Sentinel value to indicate the length of the last hash bucket */

249

251

packed_hash[hsize] = packed_entry;

250

252

253

if (packed_entry - (struct index_entry *)mem

254

!= num_entries + hsize*EXTRA_NULLS) {

255

// fprintf(stderr, "We expected %d entries, but created %d\n",

256

// num_entries + hsize*EXTRA_NULLS,

257

// (int)(packed_entry - (struct index_entry*)mem));

258

}

251

259

assert(packed_entry - (struct index_entry *)mem

252

260

== num_entries + hsize*EXTRA_NULLS);

253

261

index->last_entry = (packed_entry - 1);

285

293

}

286

294

}

287

295

296

288

297

struct delta_index *

289

298

create_index_from_old_and_hash(const struct delta_index *old,

290

299

struct unpacked_index_entry **hash,

502

511

return index;

503

512

}

504

513

514

515

struct delta_index *

516

create_index_from_old_and_new_entries(const struct delta_index *old_index,

517

struct index_entry *entries,

518

unsigned int num_entries)

519

{

520

unsigned int i, j, hsize, hmask, total_num_entries, found_null;

521

struct delta_index *index;

522

struct index_entry *entry, *packed_entry, **packed_hash;

523

struct index_entry *last_entry, null_entry = {0};

524

void *mem;

525

unsigned long memsize;

526

527

/* Determine index hash size. Note that indexing skips the

528

first byte to allow for optimizing the Rabin's polynomial

529

initialization in create_delta(). */

530

total_num_entries = num_entries + old_index->num_entries;

531

hsize = total_num_entries / 4;

532

for (i = 4; (1u << i) < hsize && i < 31; i++);

533

hsize = 1 << i;

534

if (hsize < old_index->hash_mask) {

535

/* For some reason, there was a code path that would actually *shrink*

536

* the hash size. This screws with some later code, and in general, I

537

* think it better to make the hash bigger, rather than smaller. So

538

* we'll just force the size here.

539

* Possibly done by create_delta_index running into a

540

* limit_hash_buckets call, that ended up transitioning across a

541

* power-of-2. The cause isn't 100% clear, though.

542

543

hsize = old_index->hash_mask + 1;

544

}

545

hmask = hsize - 1;

546

// fprintf(stderr, "resizing index to insert %d entries into array"

547

// " with %d entries: %x => %x\n",

548

// num_entries, old_index->num_entries, old_index->hash_mask, hmask);

549

550

memsize = sizeof(*index)

551

+ sizeof(*packed_hash) * (hsize+1)

552

+ sizeof(*packed_entry) * (total_num_entries + hsize*EXTRA_NULLS);

553

mem = malloc(memsize);

554

if (!mem) {

555

return NULL;

556

}

557

index = mem;

558

index->memsize = memsize;

559

index->hash_mask = hmask;

560

index->num_entries = total_num_entries;

561

index->last_src = old_index->last_src;

562

563

mem = index->hash;

564

packed_hash = mem;

565

mem = packed_hash + (hsize+1);

566

packed_entry = mem;

567

568

last_entry = entries + num_entries;

569

for (i = 0; i < hsize; i++) {

570

571

* Coalesce all entries belonging in one hash bucket

572

* into consecutive array entries.

573

* The entries in old_index all come before 'entries'.

574

575

packed_hash[i] = packed_entry;

576

/* Copy any of the old entries across */

577

/* Would we rather use memcpy? */

578

if (hmask == old_index->hash_mask) {

579

found_null = 0;

580

for (entry = old_index->hash[i];

581

entry < old_index->hash[i+1];

582

++entry) {

583

if (entry->ptr == NULL) {

584

found_null = 1;

585

continue;

586

} else if (found_null) {

587

fprintf(stderr, "Found a non-null entry after a"

588

" NULL entry!\n"

589

" new offset %x"

590

" w/ value %x\n",

591

i, entry->val);

592

}

593

assert((entry->val & hmask) == i);

594

*packed_entry++ = *entry;

595

}

596

} else {

597

/* If we resized the index from this action, all of the old values

598

* will be found in the previous location, but they will end up

599

* spread across the new locations.

600

601

j = i & old_index->hash_mask;

602

found_null = 0;

603

for (entry = old_index->hash[j];

604

entry < old_index->hash[j+1];

605

++entry) {

606

if (entry->ptr == NULL) {

607

found_null = 1;

608

continue;

609

} else if (found_null) {

610

fprintf(stderr, "Found a non-null entry after a"

611

" NULL entry!\n"

612

" offset %x for new offset %x"

613

" w/ value %x\n",

614

j, i, entry->val);

615

}

616

assert((entry->val & old_index->hash_mask) == j);

617

if (!(j == i || j + old_index->hash_mask + 1 == i)) {

618

fprintf(stderr, "Old hash entry %x"

619

" doesn't fit with new %x\n"

620

"old_mask: %x new_mask: %x\n",

621

j, i, old_index->hash_mask, hmask);

622

}

623

assert(j == i || j + old_index->hash_mask + 1 == i);

624

if ((entry->val & hmask) == i) {

625

/* Any entries not picked up here will be picked up on the

626

* next pass.

627

628

*packed_entry++ = *entry;

629

}

630

}

631

}

632

/* Now see if we need to insert any of the new entries.

633

* Note that loop ends up O(hsize*num_entries), so we expect that

634

* num_entries is always small.

635

* We also help a little bit by collapsing the entry range when the

636

* endpoints are inserted. However, an alternative would be to build a

637

* quick hash lookup for just the new entries.

638

* Testing shows that this list can easily get up to about 100

639

* entries, the tradeoff is a malloc, 1 pass over the entries, copying

640

* them into a sorted buffer, and a free() when done,

641

642

for (entry = entries; entry < last_entry; ++entry) {

643

if (entry->ptr != NULL && ((entry->val & hmask) == i)) {

644

/* A non-empty entry, which fits in this hash bucket */

645

*packed_entry++ = *entry;

646

*entry = null_entry;

647

if (entry == entries) {

648

/* This is the first entry in the list, skip all of the

649

* null entries from the front.

650

651

while (entries < last_entry && entries->ptr == 0) {

652

++entries;

653

/* we can also skip processing any of these entries in

654

* the current pass

655

656

++entry;

657

}

658

/* But we need to adjust back one, because the loop will

659

* increment as well.

660

661

--entry;

662

} else if (entry == (last_entry - 1)) {

663

/* This is the last entry, remove all the trailing null

664

* entries from the list.

665

666

--last_entry;

667

while (last_entry > entries && last_entry->ptr == NULL) {

668

--last_entry;

669

}

670

++last_entry;

671

}

672

}

673

}

674

/* Now insert some extra nulls */

675

for (j = 0; j < EXTRA_NULLS; ++j) {

676

*packed_entry++ = null_entry;

677

}

678

}

679

680

/* Sentinel value to indicate the length of the last hash bucket */

681

packed_hash[hsize] = packed_entry;

682

683

if ((packed_entry - (struct index_entry *)mem)

684

!= (total_num_entries + hsize*EXTRA_NULLS)) {

685

fprintf(stderr, "We expected %d entries, but created %d\n",

686

total_num_entries + hsize*EXTRA_NULLS,

687

(int)(packed_entry - (struct index_entry*)mem));

688

fflush(stderr);

689

}

690

assert((packed_entry - (struct index_entry *)mem)

691

== (total_num_entries + hsize * EXTRA_NULLS));

692

index->last_entry = (packed_entry - 1);

693

return index;

694

}

695

696

697

void

698

get_text(char buff[128], const unsigned char *ptr)

699

{

700

unsigned int i;

701

const unsigned char *start;

702

unsigned char cmd;

703

start = (ptr-RABIN_WINDOW-1);

704

cmd = *(start);

705

if (cmd < 0x80) {// This is likely to be an insert instruction

706

if (cmd < RABIN_WINDOW) {

707

cmd = RABIN_WINDOW;

708

}

709

} else {

710

/* This was either a copy [should never be] or it

711

* was a longer insert so the insert start happened at 16 more

712

* bytes back.

713

714

cmd = RABIN_WINDOW + 1;

715

}

716

if (cmd > 60) {

717

cmd = 60; /* Be friendly to 80char terms */

718

}

719

/* Copy the 1 byte command, and 4 bytes after the insert */

720

cmd += 5;

721

memcpy(buff, start, cmd);

722

buff[cmd] = 0;

723

for (i = 0; i < cmd; ++i) {

724

if (buff[i] == '\n') {

725

buff[i] = 'N';

726

} else if (buff[i] == '\t') {

727

buff[i] = 'T';

728

}

729

}

730

}

731

505

732

struct delta_index *

506

733

create_delta_index_from_delta(const struct source_info *src,

507

const struct delta_index *old)

734

struct delta_index *old_index)

508

735

{

509

unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;

510

unsigned int total_num_entries;

736

unsigned int i, num_entries, max_num_entries, prev_val, num_inserted;

737

unsigned int hash_offset;

511

738

const unsigned char *data, *buffer, *top;

512

739

unsigned char cmd;

513

struct delta_index *index;

514

struct unpacked_index_entry *entry, **hash;

515

void *mem;

516

unsigned long memsize;

740

struct delta_index *new_index;

741

struct index_entry *entry, *entries, *old_entry;

517

742

518

743

if (!src->buf || !src->size)

519

744

return NULL;

527

752

actual number will be recomputed during processing.

528

753

529

754

530

num_entries = (src->size - 1) / RABIN_WINDOW;

531

if (old != NULL)

532

total_num_entries = num_entries + old->num_entries;

533

else

534

total_num_entries = num_entries;

535

hsize = total_num_entries / 4;

536

for (i = 4; (1u << i) < hsize && i < 31; i++);

537

hsize = 1 << i;

538

hmask = hsize - 1;

539

540

/* allocate lookup index */

541

memsize = sizeof(*hash) * hsize +

542

sizeof(*entry) * total_num_entries;

543

mem = malloc(memsize);

544

if (!mem)

545

return NULL;

546

hash = mem;

547

mem = hash + hsize;

548

entry = mem;

549

550

memset(hash, 0, hsize * sizeof(*hash));

551

552

/* allocate an array to count hash num_entries */

553

hash_count = calloc(hsize, sizeof(*hash_count));

554

if (!hash_count) {

555

free(hash);

556

return NULL;

557

}

755

max_num_entries = (src->size - 1) / RABIN_WINDOW;

756

757

/* allocate an array to hold whatever entries we find */

758

entries = malloc(sizeof(*entry) * max_num_entries);

759

if (!entries) /* malloc failure */

760

return NULL;

558

761

559

762

/* then populate the index for the new data */

560

/* The create_delta_index code starts at the end and works backward,

561

* because it is easier to update the entry pointers (you always update the

562

* head, rather than updating the tail).

563

* However, we can't walk the delta structure that way.

564

565

763

prev_val = ~0;

566

764

data = buffer;

567

765

/* source size */

568

766

get_delta_hdr_size(&data, top);

569

767

/* target size */

570

768

get_delta_hdr_size(&data, top);

769

entry = entries; /* start at the first slot */

571

770

num_entries = 0; /* calculate the real number of entries */

572

771

while (data < top) {

573

772

cmd = *data++;

594

793

if (val != prev_val) {

595

794

/* Only keep the first of consecutive data */

596

795

prev_val = val;

597

i = val & hmask;

598

entry->entry.ptr = data + RABIN_WINDOW;

599

entry->entry.val = val;

600

entry->entry.src = src;

601

entry->next = NULL;

602

/* Now we have to insert this entry at the end of the hash

603

* map

604

605

if (hash[i] == NULL) {

606

hash[i] = entry;

607

} else {

608

struct unpacked_index_entry *this;

609

for (this = hash[i];

610

this->next != NULL;

611

this = this->next) { /* No action */ }

612

this->next = entry;

613

this = entry;

614

}

615

hash_count[i]++;

796

num_entries++;

797

entry->ptr = data + RABIN_WINDOW;

798

entry->val = val;

799

entry->src = src;

616

800

entry++;

617

num_entries++;

801

if (num_entries > max_num_entries) {

802

/* We ran out of entry room, something is really wrong

803

804

break;

805

}

618

806

}

619

807

}

620

808

/* Move the data pointer by whatever remainder is left */

630

818

}

631

819

if (data != top) {

632

820

/* Something was wrong with this delta */

633

free(hash);

634

free(hash_count);

821

free(entries);

635

822

return NULL;

636

823

}

637

824

if (num_entries == 0) {

638

825

/** Nothing to index **/

639

free(hash);

640

free(hash_count);

826

free(entries);

641

827

return NULL;

642

828

}

643

if (old != NULL) {

644

if (hmask == old->hash_mask) {

645

/* The total hash table size didn't change, which means that

646

* entries will end up in the same pages. We can do bulk-copying to

647

* get the final output

648

649

index = create_index_from_old_and_hash(old, hash, hsize,

650

num_entries);

651

free(hash_count);

652

free(hash);

653

if (!index) {

654

return NULL;

655

}

656

index->last_src = src;

657

return index;

658

} else {

659

total_num_entries = num_entries + old->num_entries;

660

include_entries_from_index(hash, hash_count, hsize, entry, old);

661

}

829

assert(old_index != NULL);

830

old_index->last_src = src;

831

/* See if we can fill in these values into the holes in the array */

832

entry = entries;

833

num_inserted = 0;

834

for (; num_entries > 0; --num_entries, ++entry) {

835

hash_offset = (entry->val & old_index->hash_mask);

836

/* The basic structure is a hash => packed_entries that fit in that

837

* hash bucket. Things are structured such that the hash-pointers are

838

* strictly ordered. So we start by pointing to the next pointer, and

839

* walk back until we stop getting NULL targets, and then go back

840

* forward. If there are no NULL targets, then we know because

841

* entry->ptr will not be NULL.

842

843

old_entry = old_index->hash[hash_offset + 1];

844

old_entry--;

845

while (old_entry->ptr == NULL

846

&& old_entry >= old_index->hash[hash_offset]) {

847

old_entry--;

848

}

849

old_entry++;

850

if (old_entry->ptr != NULL

851

|| old_entry >= old_index->hash[hash_offset + 1]) {

852

// char buff[128];

853

// get_text(buff, entry->ptr);

854

// fprintf(stderr, "Failed to find an opening @%x for %8x:\n '%s'\n",

855

// hash_offset, entry->val, buff);

856

// for (old_entry = old_index->hash[hash_offset];

857

// old_entry < old_index->hash[hash_offset+1];

858

// ++old_entry) {

859

// get_text(buff, old_entry->ptr);

860

// fprintf(stderr, " [%2d] %8x %8x: '%s'\n",

861

// (int)(old_entry - old_index->hash[hash_offset]),

862

// old_entry->val, old_entry->ptr, buff);

863

// }

864

break;

865

}

866

num_inserted++;

867

*old_entry = *entry;

868

/* For entries which we *do* manage to insert into old_index, we don't

869

* want them double copied into the final output.

870

871

old_index->num_entries++;

872

}

873

if (num_entries > 0) {

874

/* We couldn't fit the new entries into the old index, so allocate a

875

* new one, and fill it with stuff.

876

877

// fprintf(stderr, "inserted %d before resize\n", num_inserted);

878

new_index = create_index_from_old_and_new_entries(old_index,

879

entry, num_entries);

662

880

} else {

663

total_num_entries = num_entries;

664

}

665

666

total_num_entries = limit_hash_buckets(hash, hash_count, hsize,

667

total_num_entries);

668

free(hash_count);

669

index = pack_delta_index(hash, hsize, total_num_entries);

670

free(hash);

671

if (!index) {

672

return NULL;

673

}

674

index->last_src = src;

675

return index;

881

new_index = NULL;

882

// fprintf(stderr, "inserted %d without resizing\n", num_inserted);

883

}

884

free(entries);

885

return new_index;

676

886

}

677

887

678

888

void free_delta_index(struct delta_index *index)

Older »