~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/_btree_serializer_pyx.pyx

Committer: Martin Pool
Date: 2010-07-21 09:58:42 UTC
mfrom: (4797.58.7 2.1)
mto: (5050.3.13 2.2)
mto: This revision was merged to the branch mainline in revision 5365.
Revision ID: mbp@canonical.com-20100721095842-hz0obu8gl0x05nty

merge up 2.1 to 2.2

files removed:
bzrlib/controldir.py

bzrlib/doc_generate/builders

bzrlib/doc_generate/builders/__init__.py

bzrlib/doc_generate/builders/texinfo.py

bzrlib/doc_generate/writers

bzrlib/doc_generate/writers/__init__.py

bzrlib/doc_generate/writers/texinfo.py

bzrlib/tests/doc_generate

bzrlib/tests/doc_generate/__init__.py

bzrlib/tests/doc_generate/builders

bzrlib/tests/doc_generate/builders/__init__.py

bzrlib/tests/doc_generate/builders/test_texinfo.py

bzrlib/tests/doc_generate/writers

bzrlib/tests/doc_generate/writers/__init__.py

bzrlib/tests/doc_generate/writers/test_texinfo.py

bzrlib/tests/test__btree_serializer.py

bzrlib/tests/test_test_server.py

bzrlib/tests/test_versionedfile.py

doc/developers/transports.txt

doc/en/whats-new/whats-new-in-2.3.txt

tools/win32/py2exe_boot_common.py

files renamed:
bzrlib/doc_generate/conf.py => bzrlib/doc_generate/sphinx_conf.py

bzrlib/tests/per_controldir/ => bzrlib/tests/per_bzrdir/

bzrlib/tests/per_controldir/test_controldir.py => bzrlib/tests/per_bzrdir/test_bzrdir.py

bzrlib/tests/per_controldir_colo/ => bzrlib/tests/per_bzrdir_colo/

files modified:
Makefile

NEWS

bzrlib/__init__.py

bzrlib/_btree_serializer_pyx.pyx

bzrlib/_dirstate_helpers_pyx.pyx

bzrlib/_groupcompress_pyx.pyx

bzrlib/_simple_set_pyx.pyx

bzrlib/_static_tuple_c.pxd

bzrlib/branch.py

bzrlib/btree_index.py

bzrlib/builtins.py

bzrlib/bundle/__init__.py

bzrlib/bundle/bundle_data.py

bzrlib/bundle/serializer/__init__.py

bzrlib/bundle/serializer/v4.py

bzrlib/bzrdir.py

bzrlib/clean_tree.py

bzrlib/commands.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/diff-delta.c

bzrlib/diff.py

bzrlib/dirstate.py

bzrlib/doc_generate/autodoc_man.py

bzrlib/errors.py

bzrlib/generate_ids.py

bzrlib/globbing.py

bzrlib/groupcompress.py

bzrlib/hashcache.py

bzrlib/help_topics/__init__.py

bzrlib/help_topics/en/conflict-types.txt

bzrlib/inventory.py

bzrlib/log.py

bzrlib/lru_cache.py

bzrlib/merge.py

bzrlib/multiparent.py

bzrlib/mutabletree.py

bzrlib/osutils.py

bzrlib/plugins/launchpad/lp_directory.py

bzrlib/plugins/launchpad/test_account.py

bzrlib/plugins/launchpad/test_lp_directory.py

bzrlib/python-compat.h

bzrlib/reconcile.py

bzrlib/remote.py

bzrlib/repofmt/groupcompress_repo.py

bzrlib/repofmt/pack_repo.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/shelf_ui.py

bzrlib/smart/bzrdir.py

bzrlib/smart/server.py

bzrlib/store/text.py

bzrlib/symbol_versioning.py

bzrlib/tests/EncodingAdapter.py

bzrlib/tests/TestUtil.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/test_add.py

bzrlib/tests/blackbox/test_alias.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_branch.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_ignore.py

bzrlib/tests/blackbox/test_remove.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/features.py

bzrlib/tests/ftp_server/medusa_based.py

bzrlib/tests/ftp_server/pyftpdlib_based.py

bzrlib/tests/http_server.py

bzrlib/tests/http_utils.py

bzrlib/tests/https_server.py

bzrlib/tests/per_branch/__init__.py

bzrlib/tests/per_branch/test_branch.py

bzrlib/tests/per_branch/test_push.py

bzrlib/tests/per_bzrdir/__init__.py

bzrlib/tests/per_bzrdir/test_push.py

bzrlib/tests/per_bzrdir_colo/__init__.py

bzrlib/tests/per_bzrdir_colo/test_supported.py

bzrlib/tests/per_bzrdir_colo/test_unsupported.py

bzrlib/tests/per_interrepository/__init__.py

bzrlib/tests/per_repository/__init__.py

bzrlib/tests/per_repository/test_commit_builder.py

bzrlib/tests/per_transport.py

bzrlib/tests/per_tree/__init__.py

bzrlib/tests/per_workingtree/__init__.py

bzrlib/tests/per_workingtree/test_is_ignored.py

bzrlib/tests/per_workingtree/test_move.py

bzrlib/tests/per_workingtree/test_remove.py

bzrlib/tests/per_workingtree/test_symlinks.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test__simple_set.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_btree_index.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_clean_tree.py

bzrlib/tests/test_commands.py

bzrlib/tests/test_config.py

bzrlib/tests/test_debug.py

bzrlib/tests/test_dirstate.py

bzrlib/tests/test_foreign.py

bzrlib/tests/test_globbing.py

bzrlib/tests/test_groupcompress.py

bzrlib/tests/test_http.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_inventory_delta.py

bzrlib/tests/test_lru_cache.py

bzrlib/tests/test_lsprof.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_options.py

bzrlib/tests/test_progress.py

bzrlib/tests/test_read_bundle.py

bzrlib/tests/test_remote.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_server.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_request.py

bzrlib/tests/test_smart_transport.py

bzrlib/tests/test_smtp_connection.py

bzrlib/tests/test_source.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_tuned_gzip.py

bzrlib/transform.py

bzrlib/transport/__init__.py

bzrlib/transport/ftp/__init__.py

bzrlib/transport/ftp/_gssapi.py

bzrlib/transport/gio_transport.py

bzrlib/transport/http/__init__.py

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/local.py

bzrlib/transport/remote.py

bzrlib/transport/sftp.py

bzrlib/tree.py

bzrlib/tuned_gzip.py

bzrlib/ui/text.py

bzrlib/version_info_formats/__init__.py

bzrlib/versionedfile.py

bzrlib/weavefile.py

bzrlib/workingtree.py

bzrlib/workingtree_4.py

doc/developers/HACKING.txt

doc/developers/case-insensitive-file-systems.txt

doc/developers/conf.py

doc/developers/content-filtering.txt

doc/developers/development-repo.txt

doc/developers/index-plain.txt

doc/developers/index.txt

doc/developers/overview.txt

doc/developers/packrepo.txt

doc/developers/planned-performance-changes.txt

doc/developers/ppa.txt

doc/developers/releasing.txt

doc/developers/testing.txt

doc/en/Makefile

doc/en/_templates/index.html

doc/en/admin-guide/code-browsing.txt

doc/en/admin-guide/hooks-plugins.txt

doc/en/admin-guide/migration.txt

doc/en/admin-guide/upgrade.txt

doc/en/conf.py

doc/en/index.txt

doc/en/mini-tutorial/index.txt

doc/en/tutorials/centralized_workflow.txt

doc/en/tutorials/tutorial.txt

doc/en/tutorials/using_bazaar_with_launchpad.txt

doc/en/upgrade-guide/overview.txt

doc/en/user-guide/bzrtools_plugin.txt

doc/en/user-guide/hooks.txt

doc/en/user-guide/installing_bazaar.txt

doc/en/user-guide/introducing_bazaar.txt

doc/en/user-guide/plugins.txt

doc/en/user-guide/server.txt

doc/en/user-guide/shared_repository_layouts.txt

doc/en/user-guide/svn_plugin.txt

doc/en/user-guide/web_browsing.txt

doc/en/user-reference/readme.txt

doc/en/whats-new/whats-new-in-2.2.txt

doc/es/conf.py

doc/es/index.txt

doc/es/mini-tutorial/index.txt

doc/index.es.txt

doc/index.ja.txt

doc/index.ru.txt

doc/index.txt

doc/ja/conf.py

doc/ja/index.txt

doc/ja/mini-tutorial/index.txt

doc/ja/tutorials/centralized_workflow.txt

doc/ja/tutorials/tutorial.txt

doc/ja/tutorials/using_bazaar_with_launchpad.txt

doc/ja/upgrade-guide/overview.txt

doc/ja/user-guide/bzrtools_plugin.txt

doc/ja/user-guide/installing_bazaar.txt

doc/ja/user-guide/introducing_bazaar.txt

doc/ja/user-guide/plugins.txt

doc/ja/user-guide/server.txt

doc/ja/user-guide/shared_repository_layouts.txt

doc/ja/user-guide/svn_plugin.txt

doc/ja/user-guide/web_browsing.txt

doc/ja/user-reference/index.txt

doc/ru/conf.py

doc/ru/index.txt

doc/ru/mini-tutorial/index.txt

doc/ru/tutorials/centralized_workflow.txt

doc/ru/tutorials/tutorial.txt

doc/ru/tutorials/using_bazaar_with_launchpad.txt

doc/ru/user-guide/introducing_bazaar.txt

setup.py

tools/generate_release_notes.py

tools/packaging/update-changelogs.sh

tools/packaging/update-packaging-branches.sh

Show diffs side-by-side

added added

removed removed

bzrlib/_btree_serializer_pyx.pyx

char *PyString_AsString(object p) except NULL

object PyString_FromStringAndSize(char *, Py_ssize_t)

PyObject *PyString_FromStringAndSize_ptr "PyString_FromStringAndSize" (char *, Py_ssize_t)

object PyString_FromFormat(char *, ...)

int PyString_CheckExact(object s)

int PyString_CheckExact_ptr "PyString_CheckExact" (PyObject *)

Py_ssize_t PyString_Size(object p)

PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index)

void Py_INCREF(object)

void Py_DECREF_ptr "Py_DECREF" (PyObject *)

void *PyMem_Malloc(size_t nbytes)

void PyMem_Free(void *)

void memset(void *, int, size_t)

cdef extern from "string.h":

void *memcpy(void *dest, void *src, size_t n)

void *memchr(void *s, int c, size_t n)

int memcmp(void *s1, void *s2, size_t n)

# GNU extension

# void *memrchr(void *s, int c, size_t n)

int strncmp(char *s1, char *s2, size_t n)

unsigned long strtoul(char *s1, char **out, int base)

long long strtoll(char *s1, char **out, int base)

# It seems we need to import the definitions so that the pyrex compiler has

# local names to access them.

from _static_tuple_c cimport StaticTuple, \

import_static_tuple_c, StaticTuple_New, \

StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact, \

StaticTuple_GET_SIZE, StaticTuple_GET_ITEM

# This tells the test infrastructure that StaticTuple is a class, so we don't

# have to worry about exception checking.

## extern cdef class StaticTuple

import sys

StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact

# TODO: Find some way to import this from _dirstate_helpers

116

103

Py_DECREF_ptr(py_str)

117

104

return result

118

105

106

from bzrlib import _static_tuple_c

119

107

# This sets up the StaticTuple C_API functionality

120

108

import_static_tuple_c()

121

109

327

315

return parser.parse()

328

316

329

317

330

# TODO: We can go from 8 byte offset + 4 byte length to a simple lookup,

331

# because the block_offset + length is likely to be repeated. However,

332

# the big win there is to cache across pages, and not just one page

333

# Though if we did cache in a page, we could certainly use a short int.

334

# And this goes from 40 bytes to 30 bytes.

335

# One slightly ugly option would be to cache block offsets in a global.

336

# However, that leads to thread-safety issues, etc.

337

ctypedef struct gc_chk_sha1_record:

338

long long block_offset

339

unsigned int block_length

340

unsigned int record_start

341

unsigned int record_end

342

char sha1[20]

343

344

345

cdef int _unhexbuf[256]

346

cdef char *_hexbuf

347

_hexbuf = '0123456789abcdef'

348

349

cdef _populate_unhexbuf():

350

cdef int i

351

for i from 0 <= i < 256:

352

_unhexbuf[i] = -1

353

for i from 0 <= i < 10: # 0123456789 => map to the raw number

354

_unhexbuf[(i + c'0')] = i

355

for i from 10 <= i < 16: # abcdef => 10, 11, 12, 13, 14, 15, 16

356

_unhexbuf[(i - 10 + c'a')] = i

357

for i from 10 <= i < 16: # ABCDEF => 10, 11, 12, 13, 14, 15, 16

358

_unhexbuf[(i - 10 + c'A')] = i

359

_populate_unhexbuf()

360

361

362

cdef int _unhexlify_sha1(char *as_hex, char *as_bin): # cannot_raise

363

"""Take the hex sha1 in as_hex and make it binary in as_bin

364

365

Same as binascii.unhexlify, but working on C strings, not Python objects.

366

"""

367

cdef int top

368

cdef int bot

369

cdef int i, j

370

cdef char *cur

371

372

# binascii does this using isupper() and tolower() and ?: syntax. I'm

373

# guessing a simple lookup array should be faster.

374

j = 0

375

for i from 0 <= i < 20:

376

top = _unhexbuf[<unsigned char>(as_hex[j])]

377

j = j + 1

378

bot = _unhexbuf[<unsigned char>(as_hex[j])]

379

j = j + 1

380

if top == -1 or bot == -1:

381

return 0

382

as_bin[i] = <unsigned char>((top << 4) + bot);

383

return 1

384

385

386

def _py_unhexlify(as_hex):

387

"""For the test infrastructure, just thunks to _unhexlify_sha1"""

388

if len(as_hex) != 40 or not PyString_CheckExact(as_hex):

389

raise ValueError('not a 40-byte hex digest')

390

as_bin = PyString_FromStringAndSize(NULL, 20)

391

if _unhexlify_sha1(PyString_AS_STRING(as_hex), PyString_AS_STRING(as_bin)):

392

return as_bin

393

return None

394

395

396

cdef void _hexlify_sha1(char *as_bin, char *as_hex): # cannot_raise

397

cdef int i, j

398

cdef char c

399

400

j = 0

401

for i from 0 <= i < 20:

402

c = as_bin[i]

403

as_hex[j] = _hexbuf[(c>>4)&0xf]

404

j = j + 1

405

as_hex[j] = _hexbuf[(c)&0xf]

406

j = j + 1

407

408

409

def _py_hexlify(as_bin):

410

"""For test infrastructure, thunk to _hexlify_sha1"""

411

if len(as_bin) != 20 or not PyString_CheckExact(as_bin):

412

raise ValueError('not a 20-byte binary digest')

413

as_hex = PyString_FromStringAndSize(NULL, 40)

414

_hexlify_sha1(PyString_AS_STRING(as_bin), PyString_AS_STRING(as_hex))

415

return as_hex

416

417

418

cdef int _key_to_sha1(key, char *sha1): # cannot_raise

419

"""Map a key into its sha1 content.

420

421

:param key: A tuple of style ('sha1:abcd...',)

422

:param sha1: A char buffer of 20 bytes

423

:return: 1 if this could be converted, 0 otherwise

424

"""

425

cdef char *c_val

426

cdef PyObject *p_val

427

428

if StaticTuple_CheckExact(key) and StaticTuple_GET_SIZE(key) == 1:

429

p_val = <PyObject *>StaticTuple_GET_ITEM(key, 0)

430

elif (PyTuple_CheckExact(key) and PyTuple_GET_SIZE(key) == 1):

431

p_val = PyTuple_GET_ITEM_ptr_object(key, 0)

432

else:

433

# Not a tuple or a StaticTuple

434

return 0

435

if (PyString_CheckExact_ptr(p_val) and PyString_GET_SIZE_ptr(p_val) == 45):

436

c_val = PyString_AS_STRING_ptr(p_val)

437

else:

438

return 0

439

if strncmp(c_val, 'sha1:', 5) != 0:

440

return 0

441

if not _unhexlify_sha1(c_val + 5, sha1):

442

return 0

443

return 1

444

445

446

def _py_key_to_sha1(key):

447

"""Map a key to a simple sha1 string.

448

449

This is a testing thunk to the C function.

450

"""

451

as_bin_sha = PyString_FromStringAndSize(NULL, 20)

452

if _key_to_sha1(key, PyString_AS_STRING(as_bin_sha)):

453

return as_bin_sha

454

return None

455

456

457

cdef StaticTuple _sha1_to_key(char *sha1):

458

"""Compute a ('sha1:abcd',) key for a given sha1."""

459

cdef StaticTuple key

460

cdef object hexxed

461

cdef char *c_buf

462

hexxed = PyString_FromStringAndSize(NULL, 45)

463

c_buf = PyString_AS_STRING(hexxed)

464

memcpy(c_buf, 'sha1:', 5)

465

_hexlify_sha1(sha1, c_buf+5)

466

key = StaticTuple_New(1)

467

Py_INCREF(hexxed)

468

StaticTuple_SET_ITEM(key, 0, hexxed)

469

# This is a bit expensive. To parse 120 keys takes 48us, to return them all

470

# can be done in 66.6us (so 18.6us to build them all).

471

# Adding simple hash() here brings it to 76.6us (so computing the hash

472

# value of 120keys is 10us), Intern is 86.9us (another 10us to look and add

473

# them to the intern structure.)

474

# However, since we only intern keys that are in active use, it is probably

475

# a win. Since they would have been read from elsewhere anyway.

476

# We *could* hang the PyObject form off of the gc_chk_sha1_record for ones

477

# that we have deserialized. Something to think about, at least.

478

key = StaticTuple_Intern(key)

479

return key

480

481

482

def _py_sha1_to_key(sha1_bin):

483

"""Test thunk to check the sha1 mapping."""

484

if not PyString_CheckExact(sha1_bin) or PyString_GET_SIZE(sha1_bin) != 20:

485

raise ValueError('sha1_bin must be a str of exactly 20 bytes')

486

return _sha1_to_key(PyString_AS_STRING(sha1_bin))

487

488

489

cdef unsigned int _sha1_to_uint(char *sha1): # cannot_raise

490

cdef unsigned int val

491

# Must be in MSB, because that is how the content is sorted

492

val = (((<unsigned int>(sha1[0]) & 0xff) << 24)

493

| ((<unsigned int>(sha1[1]) & 0xff) << 16)

494

| ((<unsigned int>(sha1[2]) & 0xff) << 8)

495

| ((<unsigned int>(sha1[3]) & 0xff) << 0))

496

return val

497

498

499

cdef _format_record_py24(gc_chk_sha1_record *record):

500

"""Python2.4 PyString_FromFormat doesn't have %u.

501

502

It only has %d and %ld. We would really like to even have %llu, which

503

is only in python2.7. So we go back into casting to regular objects.

504

"""

505

return "%s %s %s %s" % (record.block_offset, record.block_length,

506

record.record_start, record.record_end)

507

508

509

cdef _format_record(gc_chk_sha1_record *record):

510

# This is inefficient to go from a logical state back to a

511

# string, but it makes things work a bit better internally for now.

512

if record.block_offset >= 0xFFFFFFFF:

513

# %llu is what we really want, but unfortunately it was only added

514

# in python 2.7... :(

515

block_offset_str = str(record.block_offset)

516

value = PyString_FromFormat('%s %lu %lu %lu',

517

PyString_AS_STRING(block_offset_str),

518

record.block_length,

519

record.record_start, record.record_end)

520

else:

521

value = PyString_FromFormat('%lu %lu %lu %lu',

522

<unsigned long>record.block_offset,

523

record.block_length,

524

record.record_start, record.record_end)

525

return value

526

527

ctypedef object (*formatproc)(gc_chk_sha1_record *)

528

cdef formatproc _record_formatter

529

_record_formatter = _format_record

530

if sys.version_info[:2] == (2, 4):

531

_record_formatter = _format_record_py24

532

533

534

cdef class GCCHKSHA1LeafNode:

535

"""Track all the entries for a given leaf node."""

536

537

cdef gc_chk_sha1_record *records

538

cdef public object last_key

539

cdef gc_chk_sha1_record *last_record

540

cdef public int num_records

541

# This is the number of bits to shift to get to the interesting byte. A

542

# value of 24 means that the very first byte changes across all keys.

543

# Anything else means that there is a common prefix of bits that we can

544

# ignore. 0 means that at least the first 3 bytes are identical, though

545

# that is going to be very rare

546

cdef public unsigned char common_shift

547

# This maps an interesting byte to the first record that matches.

548

# Equivalent to bisect.bisect_left(self.records, sha1), though only taking

549

# into account that one byte.

550

cdef unsigned char offsets[257]

551

552

def __sizeof__(self):

553

# :( Why doesn't Pyrex let me do a simple sizeof(GCCHKSHA1LeafNode)

554

# like Cython? Explicitly enumerating everything here seems to leave my

555

# size off by 2 (286 bytes vs 288 bytes actual). I'm guessing it is an

556

# alignment/padding issue. Oh well- at least we scale properly with

557

# num_records and are very close to correct, which is what I care

558

# about.

559

# If we ever decide to require cython:

560

# return (sizeof(GCCHKSHA1LeafNode)

561

# + sizeof(gc_chk_sha1_record)*self.num_records)

562

return (sizeof(PyObject) + sizeof(void*) + sizeof(int)

563

+ sizeof(gc_chk_sha1_record*) + sizeof(PyObject *)

564

+ sizeof(gc_chk_sha1_record*) + sizeof(char)

565

+ sizeof(unsigned char)*257

566

+ sizeof(gc_chk_sha1_record)*self.num_records)

567

568

def __dealloc__(self):

569

if self.records != NULL:

570

PyMem_Free(self.records)

571

self.records = NULL

572

573

def __init__(self, bytes):

574

self._parse_bytes(bytes)

575

self.last_key = None

576

self.last_record = NULL

577

578

property min_key:

579

def __get__(self):

580

if self.num_records > 0:

581

return _sha1_to_key(self.records[0].sha1)

582

return None

583

584

property max_key:

585

def __get__(self):

586

if self.num_records > 0:

587

return _sha1_to_key(self.records[self.num_records-1].sha1)

588

return None

589

590

cdef StaticTuple _record_to_value_and_refs(self,

591

gc_chk_sha1_record *record):

592

"""Extract the refs and value part of this record."""

593

cdef StaticTuple value_and_refs

594

cdef StaticTuple empty

595

value_and_refs = StaticTuple_New(2)

596

value = _record_formatter(record)

597

Py_INCREF(value)

598

StaticTuple_SET_ITEM(value_and_refs, 0, value)

599

# Always empty refs

600

empty = StaticTuple_New(0)

601

Py_INCREF(empty)

602

StaticTuple_SET_ITEM(value_and_refs, 1, empty)

603

return value_and_refs

604

605

cdef StaticTuple _record_to_item(self, gc_chk_sha1_record *record):

606

"""Turn a given record back into a fully fledged item.

607

"""

608

cdef StaticTuple item

609

cdef StaticTuple key

610

cdef StaticTuple value_and_refs

611

cdef object value

612

key = _sha1_to_key(record.sha1)

613

item = StaticTuple_New(2)

614

Py_INCREF(key)

615

StaticTuple_SET_ITEM(item, 0, key)

616

value_and_refs = self._record_to_value_and_refs(record)

617

Py_INCREF(value_and_refs)

618

StaticTuple_SET_ITEM(item, 1, value_and_refs)

619

return item

620

621

cdef gc_chk_sha1_record* _lookup_record(self, char *sha1) except? NULL:

622

"""Find a gc_chk_sha1_record that matches the sha1 supplied."""

623

cdef int lo, hi, mid, the_cmp

624

cdef int offset

625

626

# TODO: We can speed up misses by comparing this sha1 to the common

627

# bits, and seeing if the common prefix matches, if not, we don't

628

# need to search for anything because it cannot match

629

# Use the offset array to find the closest fit for this entry

630

# follow that up with bisecting, since multiple keys can be in one

631

# spot

632

# Bisecting dropped us from 7000 comparisons to 582 (4.8/key), using

633

# the offset array dropped us from 23us to 20us and 156 comparisions

634

# (1.3/key)

635

offset = self._offset_for_sha1(sha1)

636

lo = self.offsets[offset]

637

hi = self.offsets[offset+1]

638

if hi == 255:

639

# if hi == 255 that means we potentially ran off the end of the

640

# list, so push it up to num_records

641

# note that if 'lo' == 255, that is ok, because we can start

642

# searching from that part of the list.

643

hi = self.num_records

644

local_n_cmp = 0

645

while lo < hi:

646

mid = (lo + hi) / 2

647

the_cmp = memcmp(self.records[mid].sha1, sha1, 20)

648

if the_cmp == 0:

649

return &self.records[mid]

650

elif the_cmp < 0:

651

lo = mid + 1

652

else:

653

hi = mid

654

return NULL

655

656

def __contains__(self, key):

657

cdef char sha1[20]

658

cdef gc_chk_sha1_record *record

659

if _key_to_sha1(key, sha1):

660

# If it isn't a sha1 key, then it won't be in this leaf node

661

record = self._lookup_record(sha1)

662

if record != NULL:

663

self.last_key = key

664

self.last_record = record

665

return True

666

return False

667

668

def __getitem__(self, key):

669

cdef char sha1[20]

670

cdef gc_chk_sha1_record *record

671

record = NULL

672

if self.last_record != NULL and key is self.last_key:

673

record = self.last_record

674

elif _key_to_sha1(key, sha1):

675

record = self._lookup_record(sha1)

676

if record == NULL:

677

raise KeyError('key %r is not present' % (key,))

678

return self._record_to_value_and_refs(record)

679

680

def __len__(self):

681

return self.num_records

682

683

def all_keys(self):

684

cdef int i

685

result = []

686

for i from 0 <= i < self.num_records:

687

PyList_Append(result, _sha1_to_key(self.records[i].sha1))

688

return result

689

690

def all_items(self):

691

cdef int i

692

result = []

693

for i from 0 <= i < self.num_records:

694

item = self._record_to_item(&self.records[i])

695

PyList_Append(result, item)

696

return result

697

698

cdef int _count_records(self, char *c_content, char *c_end): # cannot_raise

699

"""Count how many records are in this section."""

700

cdef char *c_cur

701

cdef int num_records

702

703

c_cur = c_content

704

num_records = 0

705

while c_cur != NULL and c_cur < c_end:

706

c_cur = <char *>memchr(c_cur, c'\n', c_end - c_cur);

707

if c_cur == NULL:

708

break

709

c_cur = c_cur + 1

710

num_records = num_records + 1

711

return num_records

712

713

cdef _parse_bytes(self, bytes):

714

"""Parse the string 'bytes' into content."""

715

cdef char *c_bytes

716

cdef char *c_cur

717

cdef char *c_end

718

cdef Py_ssize_t n_bytes

719

cdef int num_records

720

cdef int entry

721

cdef gc_chk_sha1_record *cur_record

722

723

if not PyString_CheckExact(bytes):

724

raise TypeError('We only support parsing plain 8-bit strings.')

725

# Pass 1, count how many records there will be

726

n_bytes = PyString_GET_SIZE(bytes)

727

c_bytes = PyString_AS_STRING(bytes)

728

c_end = c_bytes + n_bytes

729

if strncmp(c_bytes, 'type=leaf\n', 10):

730

raise ValueError("bytes did not start with 'type=leaf\\n': %r"

731

% (bytes[:10],))

732

c_cur = c_bytes + 10

733

num_records = self._count_records(c_cur, c_end)

734

# Now allocate the memory for these items, and go to town

735

self.records = <gc_chk_sha1_record*>PyMem_Malloc(num_records *

736

(sizeof(unsigned short) + sizeof(gc_chk_sha1_record)))

737

self.num_records = num_records

738

cur_record = self.records

739

entry = 0

740

while c_cur != NULL and c_cur < c_end and entry < num_records:

741

c_cur = self._parse_one_entry(c_cur, c_end, cur_record)

742

cur_record = cur_record + 1

743

entry = entry + 1

744

if (entry != self.num_records

745

or c_cur != c_end

746

or cur_record != self.records + self.num_records):

747

raise ValueError('Something went wrong while parsing.')

748

# Pass 3: build the offset map

749

self._compute_common()

750

751

cdef char *_parse_one_entry(self, char *c_cur, char *c_end,

752

gc_chk_sha1_record *cur_record) except NULL:

753

"""Read a single sha record from the bytes.

754

:param c_cur: The pointer to the start of bytes

755

:param cur_record:

756

"""

757

cdef char *c_next

758

if strncmp(c_cur, 'sha1:', 5):

759

raise ValueError('line did not start with sha1: %r'

760

% (safe_string_from_size(c_cur, 10),))

761

c_cur = c_cur + 5

762

c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur)

763

if c_next == NULL or (c_next - c_cur != 40):

764

raise ValueError('Line did not contain 40 hex bytes')

765

if not _unhexlify_sha1(c_cur, cur_record.sha1):

766

raise ValueError('We failed to unhexlify')

767

c_cur = c_next + 1

768

if c_cur[0] != c'\0':

769

raise ValueError('only 1 null, not 2 as expected')

770

c_cur = c_cur + 1

771

cur_record.block_offset = strtoll(c_cur, &c_next, 10)

772

if c_cur == c_next or c_next[0] != c' ':

773

raise ValueError('Failed to parse block offset')

774

c_cur = c_next + 1

775

cur_record.block_length = strtoul(c_cur, &c_next, 10)

776

if c_cur == c_next or c_next[0] != c' ':

777

raise ValueError('Failed to parse block length')

778

c_cur = c_next + 1

779

cur_record.record_start = strtoul(c_cur, &c_next, 10)

780

if c_cur == c_next or c_next[0] != c' ':

781

raise ValueError('Failed to parse block length')

782

c_cur = c_next + 1

783

cur_record.record_end = strtoul(c_cur, &c_next, 10)

784

if c_cur == c_next or c_next[0] != c'\n':

785

raise ValueError('Failed to parse record end')

786

c_cur = c_next + 1

787

return c_cur

788

789

cdef int _offset_for_sha1(self, char *sha1) except -1:

790

"""Find the first interesting 8-bits of this sha1."""

791

cdef int this_offset

792

cdef unsigned int as_uint

793

as_uint = _sha1_to_uint(sha1)

794

this_offset = (as_uint >> self.common_shift) & 0xFF

795

return this_offset

796

797

def _get_offset_for_sha1(self, sha1):

798

return self._offset_for_sha1(PyString_AS_STRING(sha1))

799

800

cdef _compute_common(self):

801

cdef unsigned int first

802

cdef unsigned int this

803

cdef unsigned int common_mask

804

cdef unsigned char common_shift

805

cdef int i

806

cdef int offset, this_offset

807

cdef int max_offset

808

# The idea with the offset map is that we should be able to quickly

809

# jump to the key that matches a gives sha1. We know that the keys are

810

# in sorted order, and we know that a lot of the prefix is going to be

811

# the same across them.

812

# By XORing the records together, we can determine what bits are set in

813

# all of them

814

if self.num_records < 2:

815

# Everything is in common if you have 0 or 1 leaves

816

# So we'll always just shift to the first byte

817

self.common_shift = 24

818

else:

819

common_mask = 0xFFFFFFFF

820

first = _sha1_to_uint(self.records[0].sha1)

821

for i from 0 < i < self.num_records:

822

this = _sha1_to_uint(self.records[i].sha1)

823

common_mask = (~(first ^ this)) & common_mask

824

common_shift = 24

825

while common_mask & 0x80000000 and common_shift > 0:

826

common_mask = common_mask << 1

827

common_shift = common_shift - 1

828

self.common_shift = common_shift

829

offset = 0

830

max_offset = self.num_records

831

# We cap this loop at 254 records. All the other offsets just get

832

# filled with 0xff as the singleton saying 'too many'.

833

# It means that if we have >255 records we have to bisect the second

834

# half of the list, but this is going to be very rare in practice.

835

if max_offset > 255:

836

max_offset = 255

837

for i from 0 <= i < max_offset:

838

this_offset = self._offset_for_sha1(self.records[i].sha1)

839

while offset <= this_offset:

840

self.offsets[offset] = i

841

offset = offset + 1

842

while offset < 257:

843

self.offsets[offset] = max_offset

844

offset = offset + 1

845

846

def _get_offsets(self):

847

cdef int i

848

result = []

849

for i from 0 <= i < 257:

850

PyList_Append(result, self.offsets[i])

851

return result

852

853

854

def _parse_into_chk(bytes, key_length, ref_list_length):

855

"""Parse into a format optimized for chk records."""

856

assert key_length == 1

857

assert ref_list_length == 0

858

return GCCHKSHA1LeafNode(bytes)

859

860

861

318

def _flatten_node(node, reference_lists):

862

319

"""Convert a node into the serialized form.

863

320

Older »