~bzr-pqm/bzr/bzr.dev

Viewing changes to bzrlib/_btree_serializer_pyx.pyx

Committer: John Arbash Meinel
Date: 2010-08-03 20:56:39 UTC
mto: This revision was merged to the branch mainline in revision 5390.
Revision ID: john@arbash-meinel.com-20100803205639-k23colcozyd14440

Implement a custom parser for chk btree leaves.

The basic parsing is actually quicker (47us to build this new structure vs 110us)
Most likely because we have 1 malloc rather than N per leaf.
Memory size is also much smaller. We'll lose a little bit if we
convert back and forth from keys, etc.

files modified:
bzrlib/_btree_serializer_pyx.pyx

bzrlib/btree_index.py

bzrlib/python-compat.h

bzrlib/tests/test_btree_index.py

Show diffs side-by-side

added added

removed removed

bzrlib/_btree_serializer_pyx.pyx

char *PyString_AsString(object p) except NULL

object PyString_FromStringAndSize(char *, Py_ssize_t)

PyObject *PyString_FromStringAndSize_ptr "PyString_FromStringAndSize" (char *, Py_ssize_t)

object PyString_FromFormat(char *, ...)

int PyString_CheckExact(object s)

int PyString_CheckExact_ptr "PyString_CheckExact" (PyObject *)

Py_ssize_t PyString_Size(object p)

PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index)

void Py_INCREF(object)

void Py_DECREF_ptr "Py_DECREF" (PyObject *)

void *PyMem_Malloc(size_t nbytes)

void PyMem_Free(void *)

void memset(void *, int, size_t)

cdef extern from "string.h":

void *memcpy(void *dest, void *src, size_t n)

void *memchr(void *s, int c, size_t n)

int memcmp(void *s1, void *s2, size_t n)

# GNU extension

# void *memrchr(void *s, int c, size_t n)

int strncmp(char *s1, char *s2, size_t n)

unsigned long strtoul(char *s1, char **out, int base)

unsigned long long strtoull(char *s1, char **out, int base)

# It seems we need to import the definitions so that the pyrex compiler has

# local names to access them.

315

322

return parser.parse()

316

323

317

324

325

ctypedef struct gc_chk_sha1_record:

326

unsigned long long block_offset

327

unsigned int block_length

328

unsigned int record_start

329

unsigned int record_end

330

char sha1[20]

331

332

333

cdef int _unhexbuf[256]

334

cdef char *_hexbuf = '01234567890abcdef'

335

336

cdef _populate_unhexbuf():

337

cdef unsigned char a

338

for a from 0 <= a < 255:

339

_unhexbuf[a] = -1

340

for a in '0123456789':

341

_unhexbuf[a] = a - c'0'

342

for a in 'abcdef':

343

_unhexbuf[a] = a - c'a' + 10

344

for a in 'ABCDEF':

345

_unhexbuf[a] = a - c'A' + 10

346

347

348

cdef int _unhexlify_sha1(char *as_hex, char *as_bin):

349

"""Take the hex sha1 in as_hex and make it binary in as_bin"""

350

cdef int top

351

cdef int bot

352

cdef int i

353

cdef char *cur

354

355

cur = as_hex

356

for i from 0 <= i < 20:

357

top = _unhexbuf[<unsigned char>(cur)]

358

cur += 1

359

bot = _unhexbuf[<unsigned char>(cur)]

360

cur += 1

361

if top == -1 or bot == -1:

362

return 0

363

as_bin[i] = (top << 4) + bot;

364

return 1

365

366

367

cdef void _hexlify_sha1(char *as_bin, char *as_hex):

368

cdef int i, j

369

cdef char c

370

371

j = 0

372

for i from 0 <= i < 20:

373

c = as_bin[i]

374

as_hex[j] = _hexbuf[(c>>4)&0xf]

375

j += 1

376

as_hex[j] = _hexbuf[(c)&0xf]

377

j += 1

378

379

380

cdef class GCCHKSHA1LeafNode:

381

"""Track all the entries for a given leaf node."""

382

383

cdef public int num_entries

384

cdef gc_chk_sha1_record *entries

385

# TODO: Consider what a mini-index might look like. Something that could

386

# let us quickly jump to a subset range, rather than doing pure

387

# bisect all the time.

388

389

def __sizeof__(self):

390

return (sizeof(GCCHKSHA1LeafNode)

391

+ sizeof(gc_chk_sha1_record)*self.num_entries)

392

393

def __dealloc__(self):

394

if self.entries != NULL:

395

PyMem_Free(self.entries)

396

self.entries = NULL

397

398

def __init__(self, bytes):

399

self._parse_bytes(bytes)

400

401

property min_key:

402

def __get__(self):

403

pass

404

405

property max_key:

406

def __get__(self):

407

pass

408

409

cdef int _key_to_sha1(self, key, char *sha1):

410

"""Map a key into its sha1 content.

411

412

:param key: A tuple of style ('sha1:abcd...',)

413

:param sha1: A char buffer of 20 bytes

414

:return: 1 if this could be converted, 0 otherwise

415

"""

416

cdef char *c_val

417

if not PyTuple_CheckExact(key) and not StaticTuple_CheckExact(key):

418

return 0

419

#? raise TypeError('Keys must be a tuple or StaticTuple')

420

if len(key) != 1:

421

return 0

422

val = key[0]

423

if not PyString_CheckExact(val) or PyString_GET_SIZE(val) != 45:

424

return 0

425

c_val = PyString_AS_STRING(val)

426

if not strncmp(c_val, 'sha1:', 5):

427

return 0

428

if not _unhexlify_sha1(c_val + 5, sha1):

429

return 0

430

return 1

431

432

cdef StaticTuple _sha1_to_key(self, char *sha1):

433

"""Compute a ('sha1:abcd',) key for a given sha1."""

434

cdef StaticTuple key

435

cdef object hexxed

436

cdef char *c_buf

437

hexxed = PyString_FromStringAndSize(NULL, 45)

438

c_buf = PyString_AS_STRING(hexxed)

439

memcpy(c_buf, 'sha1:', 5)

440

_hexlify_sha1(sha1, c_buf+5)

441

key = StaticTuple_New(1)

442

Py_INCREF(hexxed)

443

StaticTuple_SET_ITEM(key, 0, hexxed)

444

return key

445

446

cdef StaticTuple _record_to_value_and_refs(self,

447

gc_chk_sha1_record *record):

448

"""Extract the refs and value part of this record."""

449

cdef StaticTuple value_and_refs

450

cdef StaticTuple empty

451

value_and_refs = StaticTuple_New(2)

452

# This is really inefficient to go from a logical state back to a

453

# string, but it makes things work a bit better internally for now.

454

value = PyString_FromFormat('%llu %lu %lu %lu',

455

record.block_offset, record.block_length,

456

record.record_start, record.record_end)

457

Py_INCREF(value)

458

StaticTuple_SET_ITEM(value_and_refs, 0, value)

459

# Always empty refs

460

empty = StaticTuple_New(0)

461

Py_INCREF(empty)

462

StaticTuple_SET_ITEM(value_and_refs, 1, empty)

463

return value_and_refs

464

465

cdef StaticTuple _record_to_item(self, gc_chk_sha1_record *record):

466

"""Turn a given record back into a fully fledged item.

467

"""

468

cdef StaticTuple item

469

cdef StaticTuple key

470

cdef StaticTuple value_and_refs

471

cdef object value

472

key = self._sha1_to_key(record.sha1)

473

item = StaticTuple_New(2)

474

Py_INCREF(key)

475

StaticTuple_SET_ITEM(item, 0, key)

476

value_and_refs = self._record_to_value_and_refs(record)

477

Py_INCREF(value_and_refs)

478

StaticTuple_SET_ITEM(item, 1, value_and_refs)

479

return item

480

481

cdef gc_chk_sha1_record* _lookup_record(self, char *sha1):

482

"""Find a gc_chk_sha1_record that matches the sha1 supplied."""

483

# For right now we iterate, in the future we should bisect, or create

484

# a local index, or use the sha1 as a hash into a local table, etc.

485

cdef int i

486

for i from 0 <= i < self.num_entries:

487

if memcmp(self.entries[i].sha1, sha1, 20) == 0:

488

return &self.entries[i]

489

return NULL

490

491

def __contains__(self, key):

492

cdef char sha1[20]

493

cdef gc_chk_sha1_record *record

494

if not self._key_to_sha1(key, sha1):

495

# If it isn't a sha1 key, then it won't be in this leaf node

496

return False

497

return self._lookup_record(sha1) != NULL

498

499

def __getitem__(self, key):

500

cdef char sha1[20]

501

cdef gc_chk_sha1_record *record = NULL

502

if self._key_to_sha1(key, sha1):

503

record = self._lookup_record(sha1)

504

if record == NULL:

505

raise KeyError('key %r is not present' % (key,))

506

return self._record_to_value_and_refs(record)

507

508

def __len__(self):

509

return self.num_entries

510

511

def all_keys(self):

512

cdef int i

513

cdef list result

514

for i from 0 <= i < self.num_entries:

515

result.append(self._sha1_to_key(self.entries[i].sha1))

516

return result

517

518

def all_items(self):

519

cdef int i

520

cdef list result

521

for i from 0 <= i < self.num_entries:

522

result.append(self._record_to_item(self.entries + i))

523

return result

524

525

cdef _parse_bytes(self, bytes):

526

"""Parse the string 'bytes' into content."""

527

cdef char *c_bytes

528

cdef char *c_content

529

cdef char *c_cur

530

cdef char *c_end

531

cdef char *c_next

532

cdef char *c_sha1_prefix

533

cdef void *buf

534

cdef Py_ssize_t n_bytes

535

cdef int num_entries

536

cdef int entry

537

cdef int interesting_char

538

cdef int i

539

cdef gc_chk_sha1_record *cur_record

540

541

if not PyString_CheckExact(bytes):

542

raise TypeError('We only support parsing plain 8-bit strings.')

543

# Pass 1, count how many entries there will be

544

n_bytes = PyString_GET_SIZE(bytes)

545

c_bytes = PyString_AS_STRING(bytes)

546

c_end = c_bytes + n_bytes

547

if strncmp(c_bytes, 'type=leaf\n', 10):

548

raise ValueError("bytes did not start with 'type=leaf\\n': %r"

549

% (bytes[:10],))

550

c_content = c_bytes + 10

551

c_cur = c_content

552

num_entries = 0

553

while c_cur != NULL and c_cur < c_end:

554

c_cur = <char *>memchr(c_cur, c'\n', c_end - c_cur);

555

if c_cur == NULL:

556

break

557

c_cur += 1

558

num_entries += 1

559

# Now allocate the memory for these items, and go to town

560

# We allocate both the offsets and the entries in the same malloc. we

561

# should probably pay a bit closer attention to alignment

562

buf = PyMem_Malloc(num_entries *

563

(sizeof(unsigned short) + sizeof(gc_chk_sha1_record)))

564

self.entries = <gc_chk_sha1_record*>buf

565

self.num_entries = num_entries

566

c_cur = c_content

567

cur_record = self.entries

568

entry = 0

569

interesting_char = -1

570

while c_cur != NULL and c_cur < c_end and entry < num_entries:

571

if strncmp(c_cur, 'sha1:', 5):

572

raise ValueError('At byte %d, line did not start with sha1: %r'

573

% (c_cur - c_bytes, safe_string_from_size(c_cur, 10)))

574

c_cur += 5

575

c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur)

576

if c_next == NULL or (c_next - c_cur != 40):

577

raise ValueError('Line did not contain 40 hex bytes')

578

if not _unhexlify_sha1(c_cur, cur_record.sha1):

579

raise ValueError('We failed to unhexlify')

580

if interesting_char == -1:

581

interesting_char = 20

582

c_sha1_prefix = cur_record.sha1

583

elif interesting_char > 0:

584

for i from 0 <= i < interesting_char:

585

if cur_record.sha1[i] != c_sha1_prefix[i]:

586

interesting_char = i

587

break

588

c_cur = c_next + 1

589

if c_cur[0] != c'\0':

590

raise ValueError('only 1 null, not 2 as expected')

591

c_cur += 1

592

cur_record.block_offset = strtoull(c_cur, &c_next, 10)

593

if c_cur == c_next or c_next[0] != c' ':

594

raise ValueError('Failed to parse block offset')

595

c_cur = c_next + 1

596

cur_record.block_length = strtoul(c_cur, &c_next, 10)

597

if c_cur == c_next or c_next[0] != c' ':

598

raise ValueError('Failed to parse block length')

599

c_cur = c_next + 1

600

cur_record.record_start = strtoul(c_cur, &c_next, 10)

601

if c_cur == c_next or c_next[0] != c' ':

602

raise ValueError('Failed to parse block length')

603

c_cur = c_next + 1

604

cur_record.record_end = strtoul(c_cur, &c_next, 10)

605

if c_cur == c_next or c_next[0] != c'\n':

606

raise ValueError('Failed to parse record end')

607

c_cur = c_next + 1

608

# Pass 3: build the offset map

609

# The idea with the offset map is that we should be able to quickly

610

# jump to the key that matches a gives sha1. We know that the keys are

611

# in sorted order, and we know that a lot of the prefix is going to be

612

# the same across them.

613

614

615

616

617

def _parse_into_chk(bytes, key_length, ref_list_length):

618

"""Parse into a format optimized for chk records."""

619

assert key_length == 1

620

assert ref_list_length == 0

621

return GCCHKSHA1LeafNode(bytes)

622

623

318

624

def _flatten_node(node, reference_lists):

319

625

"""Convert a node into the serialized form.

320

626

Older »