315
322
return parser.parse()
325
ctypedef struct gc_chk_sha1_record:
326
unsigned long long block_offset
327
unsigned int block_length
328
unsigned int record_start
329
unsigned int record_end
333
cdef int _unhexbuf[256]
334
cdef char *_hexbuf = '01234567890abcdef'
336
cdef _populate_unhexbuf():
338
for a from 0 <= a < 255:
340
for a in '0123456789':
341
_unhexbuf[a] = a - c'0'
343
_unhexbuf[a] = a - c'a' + 10
345
_unhexbuf[a] = a - c'A' + 10
348
cdef int _unhexlify_sha1(char *as_hex, char *as_bin):
349
"""Take the hex sha1 in as_hex and make it binary in as_bin"""
356
for i from 0 <= i < 20:
357
top = _unhexbuf[<unsigned char>(cur)]
359
bot = _unhexbuf[<unsigned char>(cur)]
361
if top == -1 or bot == -1:
363
as_bin[i] = (top << 4) + bot;
367
cdef void _hexlify_sha1(char *as_bin, char *as_hex):
372
for i from 0 <= i < 20:
374
as_hex[j] = _hexbuf[(c>>4)&0xf]
376
as_hex[j] = _hexbuf[(c)&0xf]
380
cdef class GCCHKSHA1LeafNode:
381
"""Track all the entries for a given leaf node."""
383
cdef public int num_entries
384
cdef gc_chk_sha1_record *entries
385
# TODO: Consider what a mini-index might look like. Something that could
386
# let us quickly jump to a subset range, rather than doing pure
387
# bisect all the time.
389
def __sizeof__(self):
390
return (sizeof(GCCHKSHA1LeafNode)
391
+ sizeof(gc_chk_sha1_record)*self.num_entries)
393
def __dealloc__(self):
394
if self.entries != NULL:
395
PyMem_Free(self.entries)
398
def __init__(self, bytes):
399
self._parse_bytes(bytes)
409
cdef int _key_to_sha1(self, key, char *sha1):
410
"""Map a key into its sha1 content.
412
:param key: A tuple of style ('sha1:abcd...',)
413
:param sha1: A char buffer of 20 bytes
414
:return: 1 if this could be converted, 0 otherwise
417
if not PyTuple_CheckExact(key) and not StaticTuple_CheckExact(key):
419
#? raise TypeError('Keys must be a tuple or StaticTuple')
423
if not PyString_CheckExact(val) or PyString_GET_SIZE(val) != 45:
425
c_val = PyString_AS_STRING(val)
426
if not strncmp(c_val, 'sha1:', 5):
428
if not _unhexlify_sha1(c_val + 5, sha1):
432
cdef StaticTuple _sha1_to_key(self, char *sha1):
433
"""Compute a ('sha1:abcd',) key for a given sha1."""
437
hexxed = PyString_FromStringAndSize(NULL, 45)
438
c_buf = PyString_AS_STRING(hexxed)
439
memcpy(c_buf, 'sha1:', 5)
440
_hexlify_sha1(sha1, c_buf+5)
441
key = StaticTuple_New(1)
443
StaticTuple_SET_ITEM(key, 0, hexxed)
446
cdef StaticTuple _record_to_value_and_refs(self,
447
gc_chk_sha1_record *record):
448
"""Extract the refs and value part of this record."""
449
cdef StaticTuple value_and_refs
450
cdef StaticTuple empty
451
value_and_refs = StaticTuple_New(2)
452
# This is really inefficient to go from a logical state back to a
453
# string, but it makes things work a bit better internally for now.
454
value = PyString_FromFormat('%llu %lu %lu %lu',
455
record.block_offset, record.block_length,
456
record.record_start, record.record_end)
458
StaticTuple_SET_ITEM(value_and_refs, 0, value)
460
empty = StaticTuple_New(0)
462
StaticTuple_SET_ITEM(value_and_refs, 1, empty)
463
return value_and_refs
465
cdef StaticTuple _record_to_item(self, gc_chk_sha1_record *record):
466
"""Turn a given record back into a fully fledged item.
468
cdef StaticTuple item
470
cdef StaticTuple value_and_refs
472
key = self._sha1_to_key(record.sha1)
473
item = StaticTuple_New(2)
475
StaticTuple_SET_ITEM(item, 0, key)
476
value_and_refs = self._record_to_value_and_refs(record)
477
Py_INCREF(value_and_refs)
478
StaticTuple_SET_ITEM(item, 1, value_and_refs)
481
cdef gc_chk_sha1_record* _lookup_record(self, char *sha1):
482
"""Find a gc_chk_sha1_record that matches the sha1 supplied."""
483
# For right now we iterate, in the future we should bisect, or create
484
# a local index, or use the sha1 as a hash into a local table, etc.
486
for i from 0 <= i < self.num_entries:
487
if memcmp(self.entries[i].sha1, sha1, 20) == 0:
488
return &self.entries[i]
491
def __contains__(self, key):
493
cdef gc_chk_sha1_record *record
494
if not self._key_to_sha1(key, sha1):
495
# If it isn't a sha1 key, then it won't be in this leaf node
497
return self._lookup_record(sha1) != NULL
499
def __getitem__(self, key):
501
cdef gc_chk_sha1_record *record = NULL
502
if self._key_to_sha1(key, sha1):
503
record = self._lookup_record(sha1)
505
raise KeyError('key %r is not present' % (key,))
506
return self._record_to_value_and_refs(record)
509
return self.num_entries
514
for i from 0 <= i < self.num_entries:
515
result.append(self._sha1_to_key(self.entries[i].sha1))
521
for i from 0 <= i < self.num_entries:
522
result.append(self._record_to_item(self.entries + i))
525
cdef _parse_bytes(self, bytes):
526
"""Parse the string 'bytes' into content."""
532
cdef char *c_sha1_prefix
534
cdef Py_ssize_t n_bytes
537
cdef int interesting_char
539
cdef gc_chk_sha1_record *cur_record
541
if not PyString_CheckExact(bytes):
542
raise TypeError('We only support parsing plain 8-bit strings.')
543
# Pass 1, count how many entries there will be
544
n_bytes = PyString_GET_SIZE(bytes)
545
c_bytes = PyString_AS_STRING(bytes)
546
c_end = c_bytes + n_bytes
547
if strncmp(c_bytes, 'type=leaf\n', 10):
548
raise ValueError("bytes did not start with 'type=leaf\\n': %r"
550
c_content = c_bytes + 10
553
while c_cur != NULL and c_cur < c_end:
554
c_cur = <char *>memchr(c_cur, c'\n', c_end - c_cur);
559
# Now allocate the memory for these items, and go to town
560
# We allocate both the offsets and the entries in the same malloc. we
561
# should probably pay a bit closer attention to alignment
562
buf = PyMem_Malloc(num_entries *
563
(sizeof(unsigned short) + sizeof(gc_chk_sha1_record)))
564
self.entries = <gc_chk_sha1_record*>buf
565
self.num_entries = num_entries
567
cur_record = self.entries
569
interesting_char = -1
570
while c_cur != NULL and c_cur < c_end and entry < num_entries:
571
if strncmp(c_cur, 'sha1:', 5):
572
raise ValueError('At byte %d, line did not start with sha1: %r'
573
% (c_cur - c_bytes, safe_string_from_size(c_cur, 10)))
575
c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur)
576
if c_next == NULL or (c_next - c_cur != 40):
577
raise ValueError('Line did not contain 40 hex bytes')
578
if not _unhexlify_sha1(c_cur, cur_record.sha1):
579
raise ValueError('We failed to unhexlify')
580
if interesting_char == -1:
581
interesting_char = 20
582
c_sha1_prefix = cur_record.sha1
583
elif interesting_char > 0:
584
for i from 0 <= i < interesting_char:
585
if cur_record.sha1[i] != c_sha1_prefix[i]:
589
if c_cur[0] != c'\0':
590
raise ValueError('only 1 null, not 2 as expected')
592
cur_record.block_offset = strtoull(c_cur, &c_next, 10)
593
if c_cur == c_next or c_next[0] != c' ':
594
raise ValueError('Failed to parse block offset')
596
cur_record.block_length = strtoul(c_cur, &c_next, 10)
597
if c_cur == c_next or c_next[0] != c' ':
598
raise ValueError('Failed to parse block length')
600
cur_record.record_start = strtoul(c_cur, &c_next, 10)
601
if c_cur == c_next or c_next[0] != c' ':
602
raise ValueError('Failed to parse block length')
604
cur_record.record_end = strtoul(c_cur, &c_next, 10)
605
if c_cur == c_next or c_next[0] != c'\n':
606
raise ValueError('Failed to parse record end')
608
# Pass 3: build the offset map
609
# The idea with the offset map is that we should be able to quickly
610
# jump to the key that matches a gives sha1. We know that the keys are
611
# in sorted order, and we know that a lot of the prefix is going to be
612
# the same across them.
617
def _parse_into_chk(bytes, key_length, ref_list_length):
618
"""Parse into a format optimized for chk records."""
619
assert key_length == 1
620
assert ref_list_length == 0
621
return GCCHKSHA1LeafNode(bytes)
318
624
def _flatten_node(node, reference_lists):
319
625
"""Convert a node into the serialized form.