1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
"""Pyrex extensions to btree node parsing."""
21
cdef extern from "python-compat.h":
24
cdef extern from "stdlib.h":
25
ctypedef unsigned size_t
27
cdef extern from "Python.h":
28
ctypedef int Py_ssize_t # Required for older pyrex versions
29
ctypedef struct PyObject:
31
int PyList_Append(object lst, object item) except -1
33
char *PyString_AsString(object p) except NULL
34
object PyString_FromStringAndSize(char *, Py_ssize_t)
35
PyObject *PyString_FromStringAndSize_ptr "PyString_FromStringAndSize" (char *, Py_ssize_t)
36
object PyString_FromFormat(char *, ...)
37
int PyString_CheckExact(object s)
38
int PyString_CheckExact_ptr "PyString_CheckExact" (PyObject *)
39
Py_ssize_t PyString_Size(object p)
40
Py_ssize_t PyString_GET_SIZE_ptr "PyString_GET_SIZE" (PyObject *)
41
char * PyString_AS_STRING_ptr "PyString_AS_STRING" (PyObject *)
42
char * PyString_AS_STRING(object)
43
Py_ssize_t PyString_GET_SIZE(object)
44
int PyString_AsStringAndSize_ptr(PyObject *, char **buf, Py_ssize_t *len)
45
void PyString_InternInPlace(PyObject **)
46
int PyTuple_CheckExact(object t)
47
object PyTuple_New(Py_ssize_t n_entries)
48
void PyTuple_SET_ITEM(object, Py_ssize_t offset, object) # steals the ref
49
Py_ssize_t PyTuple_GET_SIZE(object t)
50
PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index)
51
void Py_INCREF(object)
52
void Py_DECREF_ptr "Py_DECREF" (PyObject *)
53
void *PyMem_Malloc(size_t nbytes)
54
void PyMem_Free(void *)
55
void memset(void *, int, size_t)
57
cdef extern from "string.h":
58
void *memcpy(void *dest, void *src, size_t n)
59
void *memchr(void *s, int c, size_t n)
60
int memcmp(void *s1, void *s2, size_t n)
62
# void *memrchr(void *s, int c, size_t n)
63
int strncmp(char *s1, char *s2, size_t n)
64
unsigned long strtoul(char *s1, char **out, int base)
65
long long strtoll(char *s1, char **out, int base)
68
# It seems we need to import the definitions so that the pyrex compiler has
69
# local names to access them.
70
from _static_tuple_c cimport StaticTuple, \
71
import_static_tuple_c, StaticTuple_New, \
72
StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact, \
73
StaticTuple_GET_SIZE, StaticTuple_GET_ITEM
74
# This tells the test infrastructure that StaticTuple is a class, so we don't
75
# have to worry about exception checking.
76
## extern cdef class StaticTuple
80
# TODO: Find some way to import this from _dirstate_helpers
81
cdef void* _my_memrchr(void *s, int c, size_t n): # cannot_raise
82
# memrchr seems to be a GNU extension, so we have to implement it ourselves
83
# It is not present in any win32 standard library
96
# TODO: Import this from _dirstate_helpers when it is merged
97
cdef object safe_string_from_size(char *s, Py_ssize_t size):
100
'tried to create a string with an invalid size: %d @0x%x'
102
return PyString_FromStringAndSize(s, size)
105
cdef object safe_interned_string_from_size(char *s, Py_ssize_t size):
106
cdef PyObject *py_str
108
raise AssertionError(
109
'tried to create a string with an invalid size: %d @0x%x'
111
py_str = PyString_FromStringAndSize_ptr(s, size)
112
PyString_InternInPlace(&py_str)
113
result = <object>py_str
114
# Casting a PyObject* to an <object> triggers an INCREF from Pyrex, so we
115
# DECREF it to avoid geting immortal strings
116
Py_DECREF_ptr(py_str)
119
# This sets up the StaticTuple C_API functionality
120
import_static_tuple_c()
123
cdef class BTreeLeafParser:
124
"""Parse the leaf nodes of a BTree index.
126
:ivar bytes: The PyString object containing the uncompressed text for the
128
:ivar key_length: An integer describing how many pieces the keys have for
130
:ivar ref_list_length: An integer describing how many references this index
132
:ivar keys: A PyList of keys found in this node.
134
:ivar _cur_str: A pointer to the start of the next line to parse
135
:ivar _end_str: A pointer to the end of bytes
136
:ivar _start: Pointer to the location within the current line while
138
:ivar _header_found: True when we have parsed the header for this node
143
cdef int ref_list_length
148
# The current start point for parsing
151
cdef int _header_found
153
def __init__(self, bytes, key_length, ref_list_length):
155
self.key_length = key_length
156
self.ref_list_length = ref_list_length
160
self._header_found = 0
163
cdef extract_key(self, char * last):
166
:param last: points at the byte after the last byte permitted for the
170
cdef int loop_counter
173
key = StaticTuple_New(self.key_length)
174
for loop_counter from 0 <= loop_counter < self.key_length:
176
temp_ptr = <char*>memchr(self._start, c'\0', last - self._start)
178
if loop_counter + 1 == self.key_length:
183
failure_string = ("invalid key, wanted segment from " +
184
repr(safe_string_from_size(self._start,
185
last - self._start)))
186
raise AssertionError(failure_string)
187
# capture the key string
188
if (self.key_length == 1
189
and (temp_ptr - self._start) == 45
190
and strncmp(self._start, 'sha1:', 5) == 0):
191
key_element = safe_string_from_size(self._start,
192
temp_ptr - self._start)
194
key_element = safe_interned_string_from_size(self._start,
195
temp_ptr - self._start)
196
# advance our pointer
197
self._start = temp_ptr + 1
198
Py_INCREF(key_element)
199
StaticTuple_SET_ITEM(key, loop_counter, key_element)
200
key = StaticTuple_Intern(key)
203
cdef int process_line(self) except -1:
204
"""Process a line in the bytes."""
208
cdef char *next_start
209
cdef int loop_counter
210
cdef Py_ssize_t str_len
212
self._start = self._cur_str
213
# Find the next newline
214
last = <char*>memchr(self._start, c'\n', self._end_str - self._start)
216
# Process until the end of the file
218
self._cur_str = self._end_str
220
# And the next string is right after it
221
self._cur_str = last + 1
222
# The last character is right before the '\n'
224
if last == self._start:
227
if last < self._start:
228
# Unexpected error condition - fail
229
raise AssertionError("last < self._start")
230
if 0 == self._header_found:
231
# The first line in a leaf node is the header "type=leaf\n"
232
if strncmp("type=leaf", self._start, last - self._start) == 0:
233
self._header_found = 1
236
raise AssertionError('Node did not start with "type=leaf": %r'
237
% (safe_string_from_size(self._start, last - self._start)))
239
key = self.extract_key(last)
240
# find the value area
241
temp_ptr = <char*>_my_memrchr(self._start, c'\0', last - self._start)
244
raise AssertionError("Failed to find the value area")
246
# Because of how conversions were done, we ended up with *lots* of
247
# values that are identical. These are all of the 0-length nodes
248
# that are referred to by the TREE_ROOT (and likely some other
249
# directory nodes.) For example, bzr has 25k references to
250
# something like '12607215 328306 0 0', which ends up consuming 1MB
251
# of memory, just for those strings.
252
str_len = last - temp_ptr - 1
254
and strncmp(" 0 0", last - 4, 4) == 0):
255
# This drops peak mem for bzr.dev from 87.4MB => 86.2MB
256
# For Launchpad 236MB => 232MB
257
value = safe_interned_string_from_size(temp_ptr + 1, str_len)
259
value = safe_string_from_size(temp_ptr + 1, str_len)
260
# shrink the references end point
263
if self.ref_list_length:
264
ref_lists = StaticTuple_New(self.ref_list_length)
266
while loop_counter < self.ref_list_length:
268
# extract a reference list
269
loop_counter = loop_counter + 1
270
if last < self._start:
271
raise AssertionError("last < self._start")
272
# find the next reference list end point:
273
temp_ptr = <char*>memchr(self._start, c'\t', last - self._start)
275
# Only valid for the last list
276
if loop_counter != self.ref_list_length:
278
raise AssertionError(
279
"invalid key, loop_counter != self.ref_list_length")
281
# scan to the end of the ref list area
285
# scan to the end of this ref list
287
next_start = temp_ptr + 1
288
# Now, there may be multiple keys in the ref list.
289
while self._start < ref_ptr:
290
# loop finding keys and extracting them
291
temp_ptr = <char*>memchr(self._start, c'\r',
292
ref_ptr - self._start)
294
# key runs to the end
297
PyList_Append(ref_list, self.extract_key(temp_ptr))
298
ref_list = StaticTuple_Intern(StaticTuple(*ref_list))
300
StaticTuple_SET_ITEM(ref_lists, loop_counter - 1, ref_list)
301
# prepare for the next reference list
302
self._start = next_start
303
node_value = StaticTuple(value, ref_lists)
305
if last != self._start:
306
# unexpected reference data present
307
raise AssertionError("unexpected reference data present")
308
node_value = StaticTuple(value, StaticTuple())
309
PyList_Append(self.keys, StaticTuple(key, node_value))
313
cdef Py_ssize_t byte_count
314
if not PyString_CheckExact(self.bytes):
315
raise AssertionError('self.bytes is not a string.')
316
byte_count = PyString_Size(self.bytes)
317
self._cur_str = PyString_AsString(self.bytes)
318
# This points to the last character in the string
319
self._end_str = self._cur_str + byte_count
320
while self._cur_str < self._end_str:
325
def _parse_leaf_lines(bytes, key_length, ref_list_length):
326
parser = BTreeLeafParser(bytes, key_length, ref_list_length)
327
return parser.parse()
330
# TODO: We can go from 8 byte offset + 4 byte length to a simple lookup,
331
# because the block_offset + length is likely to be repeated. However,
332
# the big win there is to cache across pages, and not just one page
333
# Though if we did cache in a page, we could certainly use a short int.
334
# And this goes from 40 bytes to 30 bytes.
335
# One slightly ugly option would be to cache block offsets in a global.
336
# However, that leads to thread-safety issues, etc.
337
ctypedef struct gc_chk_sha1_record:
338
long long block_offset
339
unsigned int block_length
340
unsigned int record_start
341
unsigned int record_end
345
cdef int _unhexbuf[256]
347
_hexbuf = '0123456789abcdef'
349
cdef _populate_unhexbuf():
351
for i from 0 <= i < 256:
353
for i from 0 <= i < 10: # 0123456789 => map to the raw number
354
_unhexbuf[(i + c'0')] = i
355
for i from 10 <= i < 16: # abcdef => 10, 11, 12, 13, 14, 15, 16
356
_unhexbuf[(i - 10 + c'a')] = i
357
for i from 10 <= i < 16: # ABCDEF => 10, 11, 12, 13, 14, 15, 16
358
_unhexbuf[(i - 10 + c'A')] = i
362
cdef int _unhexlify_sha1(char *as_hex, char *as_bin): # cannot_raise
363
"""Take the hex sha1 in as_hex and make it binary in as_bin
365
Same as binascii.unhexlify, but working on C strings, not Python objects.
372
# binascii does this using isupper() and tolower() and ?: syntax. I'm
373
# guessing a simple lookup array should be faster.
375
for i from 0 <= i < 20:
376
top = _unhexbuf[<unsigned char>(as_hex[j])]
378
bot = _unhexbuf[<unsigned char>(as_hex[j])]
380
if top == -1 or bot == -1:
382
as_bin[i] = <unsigned char>((top << 4) + bot);
386
def _py_unhexlify(as_hex):
387
"""For the test infrastructure, just thunks to _unhexlify_sha1"""
388
if len(as_hex) != 40 or not PyString_CheckExact(as_hex):
389
raise ValueError('not a 40-byte hex digest')
390
as_bin = PyString_FromStringAndSize(NULL, 20)
391
if _unhexlify_sha1(PyString_AS_STRING(as_hex), PyString_AS_STRING(as_bin)):
396
cdef void _hexlify_sha1(char *as_bin, char *as_hex): # cannot_raise
401
for i from 0 <= i < 20:
403
as_hex[j] = _hexbuf[(c>>4)&0xf]
405
as_hex[j] = _hexbuf[(c)&0xf]
409
def _py_hexlify(as_bin):
410
"""For test infrastructure, thunk to _hexlify_sha1"""
411
if len(as_bin) != 20 or not PyString_CheckExact(as_bin):
412
raise ValueError('not a 20-byte binary digest')
413
as_hex = PyString_FromStringAndSize(NULL, 40)
414
_hexlify_sha1(PyString_AS_STRING(as_bin), PyString_AS_STRING(as_hex))
418
cdef int _key_to_sha1(key, char *sha1): # cannot_raise
419
"""Map a key into its sha1 content.
421
:param key: A tuple of style ('sha1:abcd...',)
422
:param sha1: A char buffer of 20 bytes
423
:return: 1 if this could be converted, 0 otherwise
428
if StaticTuple_CheckExact(key) and StaticTuple_GET_SIZE(key) == 1:
429
p_val = <PyObject *>StaticTuple_GET_ITEM(key, 0)
430
elif (PyTuple_CheckExact(key) and PyTuple_GET_SIZE(key) == 1):
431
p_val = PyTuple_GET_ITEM_ptr_object(key, 0)
433
# Not a tuple or a StaticTuple
435
if (PyString_CheckExact_ptr(p_val) and PyString_GET_SIZE_ptr(p_val) == 45):
436
c_val = PyString_AS_STRING_ptr(p_val)
439
if strncmp(c_val, 'sha1:', 5) != 0:
441
if not _unhexlify_sha1(c_val + 5, sha1):
446
def _py_key_to_sha1(key):
447
"""Map a key to a simple sha1 string.
449
This is a testing thunk to the C function.
451
as_bin_sha = PyString_FromStringAndSize(NULL, 20)
452
if _key_to_sha1(key, PyString_AS_STRING(as_bin_sha)):
457
cdef StaticTuple _sha1_to_key(char *sha1):
458
"""Compute a ('sha1:abcd',) key for a given sha1."""
462
hexxed = PyString_FromStringAndSize(NULL, 45)
463
c_buf = PyString_AS_STRING(hexxed)
464
memcpy(c_buf, 'sha1:', 5)
465
_hexlify_sha1(sha1, c_buf+5)
466
key = StaticTuple_New(1)
468
StaticTuple_SET_ITEM(key, 0, hexxed)
469
# This is a bit expensive. To parse 120 keys takes 48us, to return them all
470
# can be done in 66.6us (so 18.6us to build them all).
471
# Adding simple hash() here brings it to 76.6us (so computing the hash
472
# value of 120keys is 10us), Intern is 86.9us (another 10us to look and add
473
# them to the intern structure.)
474
# However, since we only intern keys that are in active use, it is probably
475
# a win. Since they would have been read from elsewhere anyway.
476
# We *could* hang the PyObject form off of the gc_chk_sha1_record for ones
477
# that we have deserialized. Something to think about, at least.
478
key = StaticTuple_Intern(key)
482
def _py_sha1_to_key(sha1_bin):
483
"""Test thunk to check the sha1 mapping."""
484
if not PyString_CheckExact(sha1_bin) or PyString_GET_SIZE(sha1_bin) != 20:
485
raise ValueError('sha1_bin must be a str of exactly 20 bytes')
486
return _sha1_to_key(PyString_AS_STRING(sha1_bin))
489
cdef unsigned int _sha1_to_uint(char *sha1): # cannot_raise
490
cdef unsigned int val
491
# Must be in MSB, because that is how the content is sorted
492
val = (((<unsigned int>(sha1[0]) & 0xff) << 24)
493
| ((<unsigned int>(sha1[1]) & 0xff) << 16)
494
| ((<unsigned int>(sha1[2]) & 0xff) << 8)
495
| ((<unsigned int>(sha1[3]) & 0xff) << 0))
499
cdef _format_record_py24(gc_chk_sha1_record *record):
500
"""Python2.4 PyString_FromFormat doesn't have %u.
502
It only has %d and %ld. We would really like to even have %llu, which
503
is only in python2.7. So we go back into casting to regular objects.
505
return "%s %s %s %s" % (record.block_offset, record.block_length,
506
record.record_start, record.record_end)
509
cdef _format_record(gc_chk_sha1_record *record):
510
# This is inefficient to go from a logical state back to a
511
# string, but it makes things work a bit better internally for now.
512
if record.block_offset >= 0xFFFFFFFF:
513
# %llu is what we really want, but unfortunately it was only added
514
# in python 2.7... :(
515
block_offset_str = str(record.block_offset)
516
value = PyString_FromFormat('%s %lu %lu %lu',
517
PyString_AS_STRING(block_offset_str),
519
record.record_start, record.record_end)
521
value = PyString_FromFormat('%lu %lu %lu %lu',
522
<unsigned long>record.block_offset,
524
record.record_start, record.record_end)
527
ctypedef object (*formatproc)(gc_chk_sha1_record *)
528
cdef formatproc _record_formatter
529
_record_formatter = _format_record
530
if sys.version_info[:2] == (2, 4):
531
_record_formatter = _format_record_py24
534
cdef class GCCHKSHA1LeafNode:
535
"""Track all the entries for a given leaf node."""
537
cdef gc_chk_sha1_record *records
538
cdef public object last_key
539
cdef gc_chk_sha1_record *last_record
540
cdef public int num_records
541
# This is the number of bits to shift to get to the interesting byte. A
542
# value of 24 means that the very first byte changes across all keys.
543
# Anything else means that there is a common prefix of bits that we can
544
# ignore. 0 means that at least the first 3 bytes are identical, though
545
# that is going to be very rare
546
cdef public unsigned char common_shift
547
# This maps an interesting byte to the first record that matches.
548
# Equivalent to bisect.bisect_left(self.records, sha1), though only taking
549
# into account that one byte.
550
cdef unsigned char offsets[257]
552
def __sizeof__(self):
553
# :( Why doesn't Pyrex let me do a simple sizeof(GCCHKSHA1LeafNode)
554
# like Cython? Explicitly enumerating everything here seems to leave my
555
# size off by 2 (286 bytes vs 288 bytes actual). I'm guessing it is an
556
# alignment/padding issue. Oh well- at least we scale properly with
557
# num_records and are very close to correct, which is what I care
559
# If we ever decide to require cython:
560
# return (sizeof(GCCHKSHA1LeafNode)
561
# + sizeof(gc_chk_sha1_record)*self.num_records)
562
return (sizeof(PyObject) + sizeof(void*) + sizeof(int)
563
+ sizeof(gc_chk_sha1_record*) + sizeof(PyObject *)
564
+ sizeof(gc_chk_sha1_record*) + sizeof(char)
565
+ sizeof(unsigned char)*257
566
+ sizeof(gc_chk_sha1_record)*self.num_records)
568
def __dealloc__(self):
569
if self.records != NULL:
570
PyMem_Free(self.records)
573
def __init__(self, bytes):
574
self._parse_bytes(bytes)
576
self.last_record = NULL
580
if self.num_records > 0:
581
return _sha1_to_key(self.records[0].sha1)
586
if self.num_records > 0:
587
return _sha1_to_key(self.records[self.num_records-1].sha1)
590
cdef StaticTuple _record_to_value_and_refs(self,
591
gc_chk_sha1_record *record):
592
"""Extract the refs and value part of this record."""
593
cdef StaticTuple value_and_refs
594
cdef StaticTuple empty
595
value_and_refs = StaticTuple_New(2)
596
value = _record_formatter(record)
598
StaticTuple_SET_ITEM(value_and_refs, 0, value)
600
empty = StaticTuple_New(0)
602
StaticTuple_SET_ITEM(value_and_refs, 1, empty)
603
return value_and_refs
605
cdef StaticTuple _record_to_item(self, gc_chk_sha1_record *record):
606
"""Turn a given record back into a fully fledged item.
608
cdef StaticTuple item
610
cdef StaticTuple value_and_refs
612
key = _sha1_to_key(record.sha1)
613
item = StaticTuple_New(2)
615
StaticTuple_SET_ITEM(item, 0, key)
616
value_and_refs = self._record_to_value_and_refs(record)
617
Py_INCREF(value_and_refs)
618
StaticTuple_SET_ITEM(item, 1, value_and_refs)
621
cdef gc_chk_sha1_record* _lookup_record(self, char *sha1) except? NULL:
622
"""Find a gc_chk_sha1_record that matches the sha1 supplied."""
623
cdef int lo, hi, mid, the_cmp
626
# TODO: We can speed up misses by comparing this sha1 to the common
627
# bits, and seeing if the common prefix matches, if not, we don't
628
# need to search for anything because it cannot match
629
# Use the offset array to find the closest fit for this entry
630
# follow that up with bisecting, since multiple keys can be in one
632
# Bisecting dropped us from 7000 comparisons to 582 (4.8/key), using
633
# the offset array dropped us from 23us to 20us and 156 comparisions
635
offset = self._offset_for_sha1(sha1)
636
lo = self.offsets[offset]
637
hi = self.offsets[offset+1]
639
# if hi == 255 that means we potentially ran off the end of the
640
# list, so push it up to num_records
641
# note that if 'lo' == 255, that is ok, because we can start
642
# searching from that part of the list.
643
hi = self.num_records
647
the_cmp = memcmp(self.records[mid].sha1, sha1, 20)
649
return &self.records[mid]
656
def __contains__(self, key):
658
cdef gc_chk_sha1_record *record
659
if _key_to_sha1(key, sha1):
660
# If it isn't a sha1 key, then it won't be in this leaf node
661
record = self._lookup_record(sha1)
664
self.last_record = record
668
def __getitem__(self, key):
670
cdef gc_chk_sha1_record *record
672
if self.last_record != NULL and key is self.last_key:
673
record = self.last_record
674
elif _key_to_sha1(key, sha1):
675
record = self._lookup_record(sha1)
677
raise KeyError('key %r is not present' % (key,))
678
return self._record_to_value_and_refs(record)
681
return self.num_records
686
for i from 0 <= i < self.num_records:
687
PyList_Append(result, _sha1_to_key(self.records[i].sha1))
693
for i from 0 <= i < self.num_records:
694
item = self._record_to_item(&self.records[i])
695
PyList_Append(result, item)
698
cdef int _count_records(self, char *c_content, char *c_end): # cannot_raise
699
"""Count how many records are in this section."""
705
while c_cur != NULL and c_cur < c_end:
706
c_cur = <char *>memchr(c_cur, c'\n', c_end - c_cur);
710
num_records = num_records + 1
713
cdef _parse_bytes(self, bytes):
714
"""Parse the string 'bytes' into content."""
718
cdef Py_ssize_t n_bytes
721
cdef gc_chk_sha1_record *cur_record
723
if not PyString_CheckExact(bytes):
724
raise TypeError('We only support parsing plain 8-bit strings.')
725
# Pass 1, count how many records there will be
726
n_bytes = PyString_GET_SIZE(bytes)
727
c_bytes = PyString_AS_STRING(bytes)
728
c_end = c_bytes + n_bytes
729
if strncmp(c_bytes, 'type=leaf\n', 10):
730
raise ValueError("bytes did not start with 'type=leaf\\n': %r"
733
num_records = self._count_records(c_cur, c_end)
734
# Now allocate the memory for these items, and go to town
735
self.records = <gc_chk_sha1_record*>PyMem_Malloc(num_records *
736
(sizeof(unsigned short) + sizeof(gc_chk_sha1_record)))
737
self.num_records = num_records
738
cur_record = self.records
740
while c_cur != NULL and c_cur < c_end and entry < num_records:
741
c_cur = self._parse_one_entry(c_cur, c_end, cur_record)
742
cur_record = cur_record + 1
744
if (entry != self.num_records
746
or cur_record != self.records + self.num_records):
747
raise ValueError('Something went wrong while parsing.')
748
# Pass 3: build the offset map
749
self._compute_common()
751
cdef char *_parse_one_entry(self, char *c_cur, char *c_end,
752
gc_chk_sha1_record *cur_record) except NULL:
753
"""Read a single sha record from the bytes.
754
:param c_cur: The pointer to the start of bytes
758
if strncmp(c_cur, 'sha1:', 5):
759
raise ValueError('line did not start with sha1: %r'
760
% (safe_string_from_size(c_cur, 10),))
762
c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur)
763
if c_next == NULL or (c_next - c_cur != 40):
764
raise ValueError('Line did not contain 40 hex bytes')
765
if not _unhexlify_sha1(c_cur, cur_record.sha1):
766
raise ValueError('We failed to unhexlify')
768
if c_cur[0] != c'\0':
769
raise ValueError('only 1 null, not 2 as expected')
771
cur_record.block_offset = strtoll(c_cur, &c_next, 10)
772
if c_cur == c_next or c_next[0] != c' ':
773
raise ValueError('Failed to parse block offset')
775
cur_record.block_length = strtoul(c_cur, &c_next, 10)
776
if c_cur == c_next or c_next[0] != c' ':
777
raise ValueError('Failed to parse block length')
779
cur_record.record_start = strtoul(c_cur, &c_next, 10)
780
if c_cur == c_next or c_next[0] != c' ':
781
raise ValueError('Failed to parse block length')
783
cur_record.record_end = strtoul(c_cur, &c_next, 10)
784
if c_cur == c_next or c_next[0] != c'\n':
785
raise ValueError('Failed to parse record end')
789
cdef int _offset_for_sha1(self, char *sha1) except -1:
790
"""Find the first interesting 8-bits of this sha1."""
792
cdef unsigned int as_uint
793
as_uint = _sha1_to_uint(sha1)
794
this_offset = (as_uint >> self.common_shift) & 0xFF
797
def _get_offset_for_sha1(self, sha1):
798
return self._offset_for_sha1(PyString_AS_STRING(sha1))
800
cdef _compute_common(self):
801
cdef unsigned int first
802
cdef unsigned int this
803
cdef unsigned int common_mask
804
cdef unsigned char common_shift
806
cdef int offset, this_offset
808
# The idea with the offset map is that we should be able to quickly
809
# jump to the key that matches a gives sha1. We know that the keys are
810
# in sorted order, and we know that a lot of the prefix is going to be
811
# the same across them.
812
# By XORing the records together, we can determine what bits are set in
814
if self.num_records < 2:
815
# Everything is in common if you have 0 or 1 leaves
816
# So we'll always just shift to the first byte
817
self.common_shift = 24
819
common_mask = 0xFFFFFFFF
820
first = _sha1_to_uint(self.records[0].sha1)
821
for i from 0 < i < self.num_records:
822
this = _sha1_to_uint(self.records[i].sha1)
823
common_mask = (~(first ^ this)) & common_mask
825
while common_mask & 0x80000000 and common_shift > 0:
826
common_mask = common_mask << 1
827
common_shift = common_shift - 1
828
self.common_shift = common_shift
830
max_offset = self.num_records
831
# We cap this loop at 254 records. All the other offsets just get
832
# filled with 0xff as the singleton saying 'too many'.
833
# It means that if we have >255 records we have to bisect the second
834
# half of the list, but this is going to be very rare in practice.
837
for i from 0 <= i < max_offset:
838
this_offset = self._offset_for_sha1(self.records[i].sha1)
839
while offset <= this_offset:
840
self.offsets[offset] = i
843
self.offsets[offset] = max_offset
846
def _get_offsets(self):
849
for i from 0 <= i < 257:
850
PyList_Append(result, self.offsets[i])
854
def _parse_into_chk(bytes, key_length, ref_list_length):
855
"""Parse into a format optimized for chk records."""
856
assert key_length == 1
857
assert ref_list_length == 0
858
return GCCHKSHA1LeafNode(bytes)
861
def _flatten_node(node, reference_lists):
862
"""Convert a node into the serialized form.
864
:param node: A tuple representing a node:
865
(index, key_tuple, value, references)
866
:param reference_lists: Does this index have reference lists?
867
:return: (string_key, flattened)
868
string_key The serialized key for referencing this node
869
flattened A string with the serialized form for the contents
871
cdef int have_reference_lists
872
cdef Py_ssize_t flat_len
873
cdef Py_ssize_t key_len
874
cdef Py_ssize_t node_len
876
cdef Py_ssize_t value_len
878
cdef Py_ssize_t refs_len
879
cdef Py_ssize_t next_len
880
cdef int first_ref_list
881
cdef int first_reference
883
cdef Py_ssize_t ref_bit_len
885
if not PyTuple_CheckExact(node) and not StaticTuple_CheckExact(node):
886
raise TypeError('We expected a tuple() or StaticTuple() for node not: %s'
889
have_reference_lists = reference_lists
890
if have_reference_lists:
892
raise ValueError('With ref_lists, we expected 4 entries not: %s'
895
raise ValueError('Without ref_lists, we need at least 3 entries not: %s'
897
# TODO: We can probably do better than string.join(), namely
898
# when key has only 1 item, we can just grab that string
899
# And when there are 2 items, we could do a single malloc + len() + 1
900
# also, doing .join() requires a PyObject_GetAttrString call, which
901
# we could also avoid.
902
# TODO: Note that pyrex 0.9.6 generates fairly crummy code here, using the
903
# python object interface, versus 0.9.8+ which uses a helper that
904
# checks if this supports the sequence interface.
905
# We *could* do more work on our own, and grab the actual items
906
# lists. For now, just ask people to use a better compiler. :)
907
string_key = '\0'.join(node[1])
909
# TODO: instead of using string joins, precompute the final string length,
910
# and then malloc a single string and copy everything in.
912
# TODO: We probably want to use PySequenceFast, because we have lists and
913
# tuples, but we aren't sure which we will get.
915
# line := string_key NULL flat_refs NULL value LF
916
# string_key := BYTES (NULL BYTES)*
917
# flat_refs := ref_list (TAB ref_list)*
918
# ref_list := ref (CR ref)*
919
# ref := BYTES (NULL BYTES)*
922
if have_reference_lists:
923
# Figure out how many bytes it will take to store the references
925
next_len = len(ref_lists) # TODO: use a Py function
927
# If there are no nodes, we don't need to do any work
928
# Otherwise we will need (len - 1) '\t' characters to separate
929
# the reference lists
930
refs_len = refs_len + (next_len - 1)
931
for ref_list in ref_lists:
932
next_len = len(ref_list)
934
# We will need (len - 1) '\r' characters to separate the
936
refs_len = refs_len + (next_len - 1)
937
for reference in ref_list:
938
if (not PyTuple_CheckExact(reference)
939
and not StaticTuple_CheckExact(reference)):
941
'We expect references to be tuples not: %s'
943
next_len = len(reference)
945
# We will need (len - 1) '\x00' characters to
946
# separate the reference key
947
refs_len = refs_len + (next_len - 1)
948
for ref_bit in reference:
949
if not PyString_CheckExact(ref_bit):
950
raise TypeError('We expect reference bits'
951
' to be strings not: %s'
952
% type(<object>ref_bit))
953
refs_len = refs_len + PyString_GET_SIZE(ref_bit)
955
# So we have the (key NULL refs NULL value LF)
956
key_len = PyString_Size(string_key)
958
if not PyString_CheckExact(val):
959
raise TypeError('Expected a plain str for value not: %s'
961
value = PyString_AS_STRING(val)
962
value_len = PyString_GET_SIZE(val)
963
flat_len = (key_len + 1 + refs_len + 1 + value_len + 1)
964
line = PyString_FromStringAndSize(NULL, flat_len)
965
# Get a pointer to the new buffer
966
out = PyString_AsString(line)
967
memcpy(out, PyString_AsString(string_key), key_len)
973
for ref_list in ref_lists:
974
if first_ref_list == 0:
979
for reference in ref_list:
980
if first_reference == 0:
984
next_len = len(reference)
985
for i from 0 <= i < next_len:
989
ref_bit = reference[i]
990
ref_bit_len = PyString_GET_SIZE(ref_bit)
991
memcpy(out, PyString_AS_STRING(ref_bit), ref_bit_len)
992
out = out + ref_bit_len
995
memcpy(out, value, value_len)
996
out = out + value_len
998
return string_key, line