~bzr-pqm/bzr/bzr.dev

Committer: John Arbash Meinel
Date: 2011-05-12 11:38:39 UTC
mto: (5755.2.10 2.4-max-entries-gc-602614)
mto: This revision was merged to the branch mainline in revision 5870.
Revision ID: john@arbash-meinel.com-20110512113839-reasdnirl4889ptb

Add a max_entries_per_source to DeltaIndex

This changes the sampling rate in the create_delta_from_source.
This isn't exposed higher up yet, but it work so far.

files modified:
bzrlib/_groupcompress_pyx.pyx

bzrlib/delta.h

bzrlib/diff-delta.c

bzrlib/tests/test__groupcompress.py

Show diffs side-by-side

added added

removed removed

bzrlib/diff-delta.c

#define RABIN_SHIFT 23

#define RABIN_WINDOW 16

#define MAX_NUM_ENTRIES 10000

/* The hash map is sized to put 4 entries per bucket, this gives us ~even room

* for more data. Tweaking this number above 4 doesn't seem to help much,

377

376

delta_result

378

377

create_delta_index(const struct source_info *src,

379

378

struct delta_index *old,

380

struct delta_index **fresh)

379

struct delta_index **fresh,

380

int max_entries)

381

{

382

unsigned int i, hsize, hmask, num_entries, prev_val, *hash_count;

383

unsigned int total_num_entries, stride;

392

buffer = src->buf;

393

394

/* Determine index hash size. Note that indexing skips the

395

first byte to allow for optimizing the Rabin's polynomial

396

initialization in create_delta(). */

395

first byte so we subtract 1 to get the edge cases right.

396

397

stride = RABIN_WINDOW;

398

num_entries = (src->size - 1) / RABIN_WINDOW;

399

if (num_entries > MAX_NUM_ENTRIES) {

399

if (max_entries > 0 && num_entries > max_entries) {

400

/* Limit the max number of matching entries. This reduces the 'best'

401

* possible match, but means we don't consume all of ram.

402

403

// fprintf(stderr, "limiting num_entries to %d\n", MAX_NUM_ENTRIES);

404

num_entries = MAX_NUM_ENTRIES;

405

stride = (src->size) / num_entries;

403

num_entries = max_entries;

404

stride = (src->size - 1) / num_entries;

406

405

}

407

406

if (old != NULL)

408

407

total_num_entries = num_entries + old->num_entries;

486

485

unsigned int hsize)

487

486

{

488

487

unsigned int hash_offset, hmask, memsize;

489

struct index_entry *entry, *last_entry;

488

struct index_entry *entry;

490

489

struct index_entry_linked_list *out_entry, **hash;

491

490

void *mem;

492

491

506

505

/* We know that entries are in the order we want in the output, but they

507

506

* aren't "grouped" by hash bucket yet.

508

507

509

last_entry = entries + num_entries;

510

508

for (entry = entries + num_entries - 1; entry >= entries; --entry) {

511

509

hash_offset = entry->val & hmask;

512

510

out_entry->p_entry = entry;

531

529

unsigned int i, j, hsize, hmask, total_num_entries;

532

530

struct delta_index *index;

533

531

struct index_entry *entry, *packed_entry, **packed_hash;

534

struct index_entry *last_entry, null_entry = {0};

532

struct index_entry null_entry = {0};

535

533

void *mem;

536

534

unsigned long memsize;

537

535

struct index_entry_linked_list *unpacked_entry, **mini_hash;

582

580

free(index);

583

581

return NULL;

584

582

}

585

last_entry = entries + num_entries;

586

583

for (i = 0; i < hsize; i++) {

587

584

588

585

* Coalesce all entries belonging in one hash bucket

1110

1107

return DELTA_OK;

1111

1108

}

1112

1109

1110

1111

int

1112

get_entry_summary(const struct delta_index *index, int pos,

1113

unsigned int *text_offset, unsigned int *hash_val)

1114

{

1115

int hsize;

1116

const struct index_entry *entry;

1117

const struct index_entry *start_of_entries;

1118

unsigned int offset;

1119

if (pos < 0 || text_offset == NULL || hash_val == NULL

1120

|| index == NULL)

1121

{

1122

return 0;

1123

}

1124

hsize = index->hash_mask + 1;

1125

start_of_entries = (struct index_entry *)(((struct index_entry **)index->hash) + (hsize + 1));

1126

entry = start_of_entries + pos;

1127

if (entry > index->last_entry) {

1128

return 0;

1129

}

1130

if (entry->ptr == NULL) {

1131

*text_offset = 0;

1132

*hash_val = 0;

1133

} else {

1134

offset = entry->src->agg_offset;

1135

offset += (entry->ptr - ((unsigned char *)entry->src->buf));

1136

*text_offset = offset;

1137

*hash_val = entry->val;

1138

}

1139

return 1;

1140

}

1141

1142

1143

int

1144

get_hash_offset(const struct delta_index *index, int pos,

1145

unsigned int *entry_offset)

1146

{

1147

int hsize;

1148

const struct index_entry *entry;

1149

const struct index_entry *start_of_entries;

1150

if (pos < 0 || index == NULL || entry_offset == NULL)

1151

{

1152

return 0;

1153

}

1154

hsize = index->hash_mask + 1;

1155

start_of_entries = (struct index_entry *)(((struct index_entry **)index->hash) + (hsize + 1));

1156

if (pos >= hsize) {

1157

return 0;

1158

}

1159

entry = index->hash[pos];

1160

if (entry == NULL) {

1161

*entry_offset = -1;

1162

} else {

1163

*entry_offset = (entry - start_of_entries);

1164

}

1165

return 1;

1166

}

1167

1168

1169

unsigned int

1170

rabin_hash(const unsigned char *data)

1171

{

1172

int i;

1173

unsigned int val = 0;

1174

for (i = 0; i < RABIN_WINDOW; i++)

1175

val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];

1176

return val;

1177

}

1178

1113

1179

/* vim: et ts=4 sw=4 sts=4

1114

1180

Older »