4763.2.4
by John Arbash Meinel
merge bzr.2.1 in preparation for NEWS entry. |
1 |
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
2 |
#
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
7 |
#
|
|
8 |
# This program is distributed in the hope that it will be useful,
|
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
12 |
#
|
|
13 |
# You should have received a copy of the GNU General Public License
|
|
14 |
# along with this program; if not, write to the Free Software
|
|
4183.7.1
by Sabin Iacob
update FSF mailing address |
15 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
3641.3.29
by John Arbash Meinel
Cleanup the copyright headers |
16 |
#
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
17 |
|
18 |
"""Pyrex extensions to btree node parsing."""
|
|
19 |
||
3731.1.1
by Robert Collins
* The C extensions now build on python 2.4 (Robert Collins, #271939) |
20 |
#python2.4 support
|
21 |
cdef extern from "python-compat.h": |
|
22 |
pass
|
|
23 |
||
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
24 |
cdef extern from "stdlib.h": |
25 |
ctypedef unsigned size_t |
|
26 |
||
27 |
cdef extern from "Python.h": |
|
3641.3.32
by John Arbash Meinel
PQM's pyrex version requires Py_ssize_t to be manually defined |
28 |
ctypedef int Py_ssize_t # Required for older pyrex versions |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
29 |
ctypedef struct PyObject: |
30 |
pass
|
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
31 |
int PyList_Append(object lst, object item) except -1 |
32 |
||
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
33 |
char *PyString_AsString(object p) except NULL |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
34 |
object PyString_FromStringAndSize(char *, Py_ssize_t) |
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
35 |
PyObject *PyString_FromStringAndSize_ptr "PyString_FromStringAndSize" (char *, Py_ssize_t) |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
36 |
object PyString_FromFormat(char *, ...) |
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
37 |
int PyString_CheckExact(object s) |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
38 |
int PyString_CheckExact_ptr "PyString_CheckExact" (PyObject *) |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
39 |
Py_ssize_t PyString_Size(object p) |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
40 |
Py_ssize_t PyString_GET_SIZE_ptr "PyString_GET_SIZE" (PyObject *) |
41 |
char * PyString_AS_STRING_ptr "PyString_AS_STRING" (PyObject *) |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
42 |
char * PyString_AS_STRING(object) |
43 |
Py_ssize_t PyString_GET_SIZE(object) |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
44 |
int PyString_AsStringAndSize_ptr(PyObject *, char **buf, Py_ssize_t *len) |
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
45 |
void PyString_InternInPlace(PyObject **) |
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
46 |
int PyTuple_CheckExact(object t) |
4672.2.1
by John Arbash Meinel
Speed up the innermost btree parsing function. |
47 |
object PyTuple_New(Py_ssize_t n_entries) |
48 |
void PyTuple_SET_ITEM(object, Py_ssize_t offset, object) # steals the ref |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
49 |
Py_ssize_t PyTuple_GET_SIZE(object t) |
50 |
PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index) |
|
4672.2.1
by John Arbash Meinel
Speed up the innermost btree parsing function. |
51 |
void Py_INCREF(object) |
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
52 |
void Py_DECREF_ptr "Py_DECREF" (PyObject *) |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
53 |
void *PyMem_Malloc(size_t nbytes) |
54 |
void PyMem_Free(void *) |
|
55 |
void memset(void *, int, size_t) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
56 |
|
57 |
cdef extern from "string.h": |
|
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
58 |
void *memcpy(void *dest, void *src, size_t n) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
59 |
void *memchr(void *s, int c, size_t n) |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
60 |
int memcmp(void *s1, void *s2, size_t n) |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
61 |
# GNU extension
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
62 |
# void *memrchr(void *s, int c, size_t n)
|
63 |
int strncmp(char *s1, char *s2, size_t n) |
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
64 |
unsigned long strtoul(char *s1, char **out, int base) |
5365.5.16
by John Arbash Meinel
Trim the class a little bit. |
65 |
long long strtoll(char *s1, char **out, int base) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
66 |
|
5365.5.29
by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow |
67 |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
68 |
# It seems we need to import the definitions so that the pyrex compiler has
|
69 |
# local names to access them.
|
|
70 |
from _static_tuple_c cimport StaticTuple, \ |
|
71 |
import_static_tuple_c, StaticTuple_New, \ |
|
5365.5.11
by John Arbash Meinel
get into the nitty gritty for the _key_to_sha1 function. |
72 |
StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact, \ |
73 |
StaticTuple_GET_SIZE, StaticTuple_GET_ITEM |
|
5365.5.29
by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow |
74 |
# This tells the test infrastructure that StaticTuple is a class, so we don't
|
75 |
# have to worry about exception checking.
|
|
76 |
## extern cdef class StaticTuple
|
|
5365.5.28
by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers. |
77 |
import sys |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
78 |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
79 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
80 |
# TODO: Find some way to import this from _dirstate_helpers
|
4634.117.10
by John Arbash Meinel
Change 'no except' to 'cannot_raise' |
81 |
cdef void* _my_memrchr(void *s, int c, size_t n): # cannot_raise |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
82 |
# memrchr seems to be a GNU extension, so we have to implement it ourselves
|
83 |
# It is not present in any win32 standard library
|
|
84 |
cdef char *pos |
|
85 |
cdef char *start |
|
86 |
||
87 |
start = <char*>s |
|
88 |
pos = start + n - 1 |
|
89 |
while pos >= start: |
|
90 |
if pos[0] == c: |
|
91 |
return <void*>pos |
|
92 |
pos = pos - 1 |
|
93 |
return NULL |
|
94 |
||
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
95 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
96 |
# TODO: Import this from _dirstate_helpers when it is merged
|
97 |
cdef object safe_string_from_size(char *s, Py_ssize_t size): |
|
98 |
if size < 0: |
|
99 |
raise AssertionError( |
|
100 |
'tried to create a string with an invalid size: %d @0x%x' |
|
101 |
% (size, <int>s)) |
|
102 |
return PyString_FromStringAndSize(s, size) |
|
103 |
||
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
104 |
|
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
105 |
cdef object safe_interned_string_from_size(char *s, Py_ssize_t size): |
106 |
cdef PyObject *py_str |
|
107 |
if size < 0: |
|
108 |
raise AssertionError( |
|
109 |
'tried to create a string with an invalid size: %d @0x%x' |
|
110 |
% (size, <int>s)) |
|
111 |
py_str = PyString_FromStringAndSize_ptr(s, size) |
|
112 |
PyString_InternInPlace(&py_str) |
|
113 |
result = <object>py_str |
|
114 |
# Casting a PyObject* to an <object> triggers an INCREF from Pyrex, so we
|
|
115 |
# DECREF it to avoid geting immortal strings
|
|
116 |
Py_DECREF_ptr(py_str) |
|
117 |
return result |
|
118 |
||
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
119 |
# This sets up the StaticTuple C_API functionality
|
120 |
import_static_tuple_c() |
|
121 |
||
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
122 |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
123 |
cdef class BTreeLeafParser: |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
124 |
"""Parse the leaf nodes of a BTree index.
|
125 |
||
126 |
:ivar bytes: The PyString object containing the uncompressed text for the
|
|
127 |
node.
|
|
128 |
:ivar key_length: An integer describing how many pieces the keys have for
|
|
129 |
this index.
|
|
130 |
:ivar ref_list_length: An integer describing how many references this index
|
|
131 |
contains.
|
|
132 |
:ivar keys: A PyList of keys found in this node.
|
|
133 |
||
134 |
:ivar _cur_str: A pointer to the start of the next line to parse
|
|
135 |
:ivar _end_str: A pointer to the end of bytes
|
|
136 |
:ivar _start: Pointer to the location within the current line while
|
|
137 |
parsing.
|
|
138 |
:ivar _header_found: True when we have parsed the header for this node
|
|
139 |
"""
|
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
140 |
|
141 |
cdef object bytes |
|
142 |
cdef int key_length |
|
143 |
cdef int ref_list_length |
|
144 |
cdef object keys |
|
145 |
||
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
146 |
cdef char * _cur_str |
147 |
cdef char * _end_str |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
148 |
# The current start point for parsing
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
149 |
cdef char * _start |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
150 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
151 |
cdef int _header_found |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
152 |
|
153 |
def __init__(self, bytes, key_length, ref_list_length): |
|
154 |
self.bytes = bytes |
|
155 |
self.key_length = key_length |
|
156 |
self.ref_list_length = ref_list_length |
|
157 |
self.keys = [] |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
158 |
self._cur_str = NULL |
159 |
self._end_str = NULL |
|
160 |
self._header_found = 0 |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
161 |
# keys are tuples
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
162 |
|
163 |
cdef extract_key(self, char * last): |
|
164 |
"""Extract a key.
|
|
165 |
||
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
166 |
:param last: points at the byte after the last byte permitted for the
|
167 |
key.
|
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
168 |
"""
|
169 |
cdef char *temp_ptr |
|
170 |
cdef int loop_counter |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
171 |
cdef StaticTuple key |
172 |
||
173 |
key = StaticTuple_New(self.key_length) |
|
4672.2.2
by John Arbash Meinel
Switch to using a for loop, and add only when checking last rather than |
174 |
for loop_counter from 0 <= loop_counter < self.key_length: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
175 |
# grab a key segment
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
176 |
temp_ptr = <char*>memchr(self._start, c'\0', last - self._start) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
177 |
if temp_ptr == NULL: |
4672.2.2
by John Arbash Meinel
Switch to using a for loop, and add only when checking last rather than |
178 |
if loop_counter + 1 == self.key_length: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
179 |
# capture to last
|
180 |
temp_ptr = last |
|
181 |
else: |
|
182 |
# Invalid line
|
|
183 |
failure_string = ("invalid key, wanted segment from " + |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
184 |
repr(safe_string_from_size(self._start, |
185 |
last - self._start))) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
186 |
raise AssertionError(failure_string) |
187 |
# capture the key string
|
|
4679.8.1
by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks. |
188 |
if (self.key_length == 1 |
189 |
and (temp_ptr - self._start) == 45 |
|
190 |
and strncmp(self._start, 'sha1:', 5) == 0): |
|
191 |
key_element = safe_string_from_size(self._start, |
|
192 |
temp_ptr - self._start) |
|
193 |
else: |
|
194 |
key_element = safe_interned_string_from_size(self._start, |
|
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
195 |
temp_ptr - self._start) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
196 |
# advance our pointer
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
197 |
self._start = temp_ptr + 1 |
4672.2.1
by John Arbash Meinel
Speed up the innermost btree parsing function. |
198 |
Py_INCREF(key_element) |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
199 |
StaticTuple_SET_ITEM(key, loop_counter, key_element) |
200 |
key = StaticTuple_Intern(key) |
|
4679.3.48
by John Arbash Meinel
Prototype of not interning sha1: strings. |
201 |
return key |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
202 |
|
203 |
cdef int process_line(self) except -1: |
|
204 |
"""Process a line in the bytes."""
|
|
205 |
cdef char *last |
|
206 |
cdef char *temp_ptr |
|
207 |
cdef char *ref_ptr |
|
208 |
cdef char *next_start |
|
209 |
cdef int loop_counter |
|
4679.8.1
by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks. |
210 |
cdef Py_ssize_t str_len |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
211 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
212 |
self._start = self._cur_str |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
213 |
# Find the next newline
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
214 |
last = <char*>memchr(self._start, c'\n', self._end_str - self._start) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
215 |
if last == NULL: |
216 |
# Process until the end of the file
|
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
217 |
last = self._end_str |
218 |
self._cur_str = self._end_str |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
219 |
else: |
220 |
# And the next string is right after it
|
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
221 |
self._cur_str = last + 1 |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
222 |
# The last character is right before the '\n'
|
223 |
||
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
224 |
if last == self._start: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
225 |
# parsed it all.
|
226 |
return 0 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
227 |
if last < self._start: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
228 |
# Unexpected error condition - fail
|
4634.60.1
by John Arbash Meinel
Don't return -1 directly, instead raise an exception. |
229 |
raise AssertionError("last < self._start") |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
230 |
if 0 == self._header_found: |
231 |
# The first line in a leaf node is the header "type=leaf\n"
|
|
232 |
if strncmp("type=leaf", self._start, last - self._start) == 0: |
|
233 |
self._header_found = 1 |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
234 |
return 0 |
235 |
else: |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
236 |
raise AssertionError('Node did not start with "type=leaf": %r' |
237 |
% (safe_string_from_size(self._start, last - self._start))) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
238 |
|
239 |
key = self.extract_key(last) |
|
240 |
# find the value area
|
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
241 |
temp_ptr = <char*>_my_memrchr(self._start, c'\0', last - self._start) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
242 |
if temp_ptr == NULL: |
243 |
# Invalid line
|
|
4634.60.1
by John Arbash Meinel
Don't return -1 directly, instead raise an exception. |
244 |
raise AssertionError("Failed to find the value area") |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
245 |
else: |
4679.8.1
by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks. |
246 |
# Because of how conversions were done, we ended up with *lots* of
|
247 |
# values that are identical. These are all of the 0-length nodes
|
|
248 |
# that are referred to by the TREE_ROOT (and likely some other
|
|
249 |
# directory nodes.) For example, bzr has 25k references to
|
|
250 |
# something like '12607215 328306 0 0', which ends up consuming 1MB
|
|
251 |
# of memory, just for those strings.
|
|
252 |
str_len = last - temp_ptr - 1 |
|
253 |
if (str_len > 4 |
|
254 |
and strncmp(" 0 0", last - 4, 4) == 0): |
|
255 |
# This drops peak mem for bzr.dev from 87.4MB => 86.2MB
|
|
256 |
# For Launchpad 236MB => 232MB
|
|
257 |
value = safe_interned_string_from_size(temp_ptr + 1, str_len) |
|
258 |
else: |
|
259 |
value = safe_string_from_size(temp_ptr + 1, str_len) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
260 |
# shrink the references end point
|
261 |
last = temp_ptr |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
262 |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
263 |
if self.ref_list_length: |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
264 |
ref_lists = StaticTuple_New(self.ref_list_length) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
265 |
loop_counter = 0 |
266 |
while loop_counter < self.ref_list_length: |
|
267 |
ref_list = [] |
|
268 |
# extract a reference list
|
|
269 |
loop_counter = loop_counter + 1 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
270 |
if last < self._start: |
4634.60.1
by John Arbash Meinel
Don't return -1 directly, instead raise an exception. |
271 |
raise AssertionError("last < self._start") |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
272 |
# find the next reference list end point:
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
273 |
temp_ptr = <char*>memchr(self._start, c'\t', last - self._start) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
274 |
if temp_ptr == NULL: |
275 |
# Only valid for the last list
|
|
276 |
if loop_counter != self.ref_list_length: |
|
277 |
# Invalid line
|
|
4634.60.1
by John Arbash Meinel
Don't return -1 directly, instead raise an exception. |
278 |
raise AssertionError( |
279 |
"invalid key, loop_counter != self.ref_list_length") |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
280 |
else: |
281 |
# scan to the end of the ref list area
|
|
282 |
ref_ptr = last |
|
283 |
next_start = last |
|
284 |
else: |
|
285 |
# scan to the end of this ref list
|
|
286 |
ref_ptr = temp_ptr |
|
287 |
next_start = temp_ptr + 1 |
|
288 |
# Now, there may be multiple keys in the ref list.
|
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
289 |
while self._start < ref_ptr: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
290 |
# loop finding keys and extracting them
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
291 |
temp_ptr = <char*>memchr(self._start, c'\r', |
292 |
ref_ptr - self._start) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
293 |
if temp_ptr == NULL: |
294 |
# key runs to the end
|
|
295 |
temp_ptr = ref_ptr |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
296 |
|
4679.3.28
by John Arbash Meinel
Add a KEY_HAS_HASH define. |
297 |
PyList_Append(ref_list, self.extract_key(temp_ptr)) |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
298 |
ref_list = StaticTuple_Intern(StaticTuple(*ref_list)) |
299 |
Py_INCREF(ref_list) |
|
300 |
StaticTuple_SET_ITEM(ref_lists, loop_counter - 1, ref_list) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
301 |
# prepare for the next reference list
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
302 |
self._start = next_start |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
303 |
node_value = StaticTuple(value, ref_lists) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
304 |
else: |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
305 |
if last != self._start: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
306 |
# unexpected reference data present
|
4634.60.1
by John Arbash Meinel
Don't return -1 directly, instead raise an exception. |
307 |
raise AssertionError("unexpected reference data present") |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
308 |
node_value = StaticTuple(value, StaticTuple()) |
309 |
PyList_Append(self.keys, StaticTuple(key, node_value)) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
310 |
return 0 |
311 |
||
312 |
def parse(self): |
|
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
313 |
cdef Py_ssize_t byte_count |
314 |
if not PyString_CheckExact(self.bytes): |
|
315 |
raise AssertionError('self.bytes is not a string.') |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
316 |
byte_count = PyString_Size(self.bytes) |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
317 |
self._cur_str = PyString_AsString(self.bytes) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
318 |
# This points to the last character in the string
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
319 |
self._end_str = self._cur_str + byte_count |
320 |
while self._cur_str < self._end_str: |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
321 |
self.process_line() |
322 |
return self.keys |
|
323 |
||
324 |
||
325 |
def _parse_leaf_lines(bytes, key_length, ref_list_length): |
|
326 |
parser = BTreeLeafParser(bytes, key_length, ref_list_length) |
|
327 |
return parser.parse() |
|
3641.3.18
by John Arbash Meinel
Start working on a compiled function for transforming |
328 |
|
329 |
||
5365.5.2
by John Arbash Meinel
Parse times are dramatically improved. |
330 |
# TODO: We can go from 8 byte offset + 4 byte length to a simple lookup,
|
331 |
# because the block_offset + length is likely to be repeated. However,
|
|
332 |
# the big win there is to cache across pages, and not just one page
|
|
333 |
# Though if we did cache in a page, we could certainly use a short int.
|
|
334 |
# And this goes from 40 bytes to 30 bytes.
|
|
5365.5.4
by John Arbash Meinel
Some direct tests of the hex and unhex functions. |
335 |
# One slightly ugly option would be to cache block offsets in a global.
|
336 |
# However, that leads to thread-safety issues, etc.
|
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
337 |
ctypedef struct gc_chk_sha1_record: |
5365.5.8
by John Arbash Meinel
Add tests that we handle sizes close to and >2GB properly. |
338 |
long long block_offset |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
339 |
unsigned int block_length |
340 |
unsigned int record_start |
|
341 |
unsigned int record_end |
|
342 |
char sha1[20] |
|
343 |
||
344 |
||
345 |
cdef int _unhexbuf[256] |
|
5365.5.22
by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand. |
346 |
cdef char *_hexbuf |
347 |
_hexbuf = '0123456789abcdef' |
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
348 |
|
349 |
cdef _populate_unhexbuf(): |
|
5365.5.4
by John Arbash Meinel
Some direct tests of the hex and unhex functions. |
350 |
cdef int i |
351 |
for i from 0 <= i < 256: |
|
352 |
_unhexbuf[i] = -1 |
|
353 |
for i from 0 <= i < 10: # 0123456789 => map to the raw number |
|
354 |
_unhexbuf[(i + c'0')] = i |
|
355 |
for i from 10 <= i < 16: # abcdef => 10, 11, 12, 13, 14, 15, 16 |
|
356 |
_unhexbuf[(i - 10 + c'a')] = i |
|
357 |
for i from 10 <= i < 16: # ABCDEF => 10, 11, 12, 13, 14, 15, 16 |
|
358 |
_unhexbuf[(i - 10 + c'A')] = i |
|
359 |
_populate_unhexbuf() |
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
360 |
|
361 |
||
5365.5.29
by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow |
362 |
cdef int _unhexlify_sha1(char *as_hex, char *as_bin): # cannot_raise |
5365.5.5
by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers, |
363 |
"""Take the hex sha1 in as_hex and make it binary in as_bin
|
364 |
|
|
365 |
Same as binascii.unhexlify, but working on C strings, not Python objects.
|
|
366 |
"""
|
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
367 |
cdef int top |
368 |
cdef int bot |
|
5365.5.4
by John Arbash Meinel
Some direct tests of the hex and unhex functions. |
369 |
cdef int i, j |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
370 |
cdef char *cur |
371 |
||
5365.5.5
by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers, |
372 |
# binascii does this using isupper() and tolower() and ?: syntax. I'm
|
373 |
# guessing a simple lookup array should be faster.
|
|
5365.5.4
by John Arbash Meinel
Some direct tests of the hex and unhex functions. |
374 |
j = 0 |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
375 |
for i from 0 <= i < 20: |
5365.5.4
by John Arbash Meinel
Some direct tests of the hex and unhex functions. |
376 |
top = _unhexbuf[<unsigned char>(as_hex[j])] |
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
377 |
j = j + 1 |
5365.5.4
by John Arbash Meinel
Some direct tests of the hex and unhex functions. |
378 |
bot = _unhexbuf[<unsigned char>(as_hex[j])] |
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
379 |
j = j + 1 |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
380 |
if top == -1 or bot == -1: |
381 |
return 0 |
|
5365.5.4
by John Arbash Meinel
Some direct tests of the hex and unhex functions. |
382 |
as_bin[i] = <unsigned char>((top << 4) + bot); |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
383 |
return 1 |
384 |
||
385 |
||
5365.5.24
by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :) |
386 |
def _py_unhexlify(as_hex): |
5365.5.4
by John Arbash Meinel
Some direct tests of the hex and unhex functions. |
387 |
"""For the test infrastructure, just thunks to _unhexlify_sha1"""
|
388 |
if len(as_hex) != 40 or not PyString_CheckExact(as_hex): |
|
389 |
raise ValueError('not a 40-byte hex digest') |
|
390 |
as_bin = PyString_FromStringAndSize(NULL, 20) |
|
391 |
if _unhexlify_sha1(PyString_AS_STRING(as_hex), PyString_AS_STRING(as_bin)): |
|
392 |
return as_bin |
|
393 |
return None |
|
394 |
||
395 |
||
5365.5.29
by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow |
396 |
cdef void _hexlify_sha1(char *as_bin, char *as_hex): # cannot_raise |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
397 |
cdef int i, j |
398 |
cdef char c |
|
399 |
||
400 |
j = 0 |
|
401 |
for i from 0 <= i < 20: |
|
402 |
c = as_bin[i] |
|
403 |
as_hex[j] = _hexbuf[(c>>4)&0xf] |
|
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
404 |
j = j + 1 |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
405 |
as_hex[j] = _hexbuf[(c)&0xf] |
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
406 |
j = j + 1 |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
407 |
|
408 |
||
5365.5.24
by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :) |
409 |
def _py_hexlify(as_bin): |
5365.5.4
by John Arbash Meinel
Some direct tests of the hex and unhex functions. |
410 |
"""For test infrastructure, thunk to _hexlify_sha1"""
|
411 |
if len(as_bin) != 20 or not PyString_CheckExact(as_bin): |
|
412 |
raise ValueError('not a 20-byte binary digest') |
|
413 |
as_hex = PyString_FromStringAndSize(NULL, 40) |
|
414 |
_hexlify_sha1(PyString_AS_STRING(as_bin), PyString_AS_STRING(as_hex)) |
|
415 |
return as_hex |
|
416 |
||
417 |
||
5365.5.29
by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow |
418 |
cdef int _key_to_sha1(key, char *sha1): # cannot_raise |
5365.5.5
by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers, |
419 |
"""Map a key into its sha1 content.
|
420 |
||
421 |
:param key: A tuple of style ('sha1:abcd...',)
|
|
422 |
:param sha1: A char buffer of 20 bytes
|
|
423 |
:return: 1 if this could be converted, 0 otherwise
|
|
424 |
"""
|
|
425 |
cdef char *c_val |
|
5365.5.11
by John Arbash Meinel
get into the nitty gritty for the _key_to_sha1 function. |
426 |
cdef PyObject *p_val |
427 |
||
428 |
if StaticTuple_CheckExact(key) and StaticTuple_GET_SIZE(key) == 1: |
|
429 |
p_val = <PyObject *>StaticTuple_GET_ITEM(key, 0) |
|
430 |
elif (PyTuple_CheckExact(key) and PyTuple_GET_SIZE(key) == 1): |
|
431 |
p_val = PyTuple_GET_ITEM_ptr_object(key, 0) |
|
432 |
else: |
|
433 |
# Not a tuple or a StaticTuple
|
|
434 |
return 0 |
|
435 |
if (PyString_CheckExact_ptr(p_val) and PyString_GET_SIZE_ptr(p_val) == 45): |
|
436 |
c_val = PyString_AS_STRING_ptr(p_val) |
|
437 |
else: |
|
438 |
return 0 |
|
5365.5.5
by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers, |
439 |
if strncmp(c_val, 'sha1:', 5) != 0: |
440 |
return 0 |
|
441 |
if not _unhexlify_sha1(c_val + 5, sha1): |
|
442 |
return 0 |
|
443 |
return 1 |
|
444 |
||
445 |
||
5365.5.24
by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :) |
446 |
def _py_key_to_sha1(key): |
5365.5.5
by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers, |
447 |
"""Map a key to a simple sha1 string.
|
448 |
||
449 |
This is a testing thunk to the C function.
|
|
450 |
"""
|
|
451 |
as_bin_sha = PyString_FromStringAndSize(NULL, 20) |
|
452 |
if _key_to_sha1(key, PyString_AS_STRING(as_bin_sha)): |
|
453 |
return as_bin_sha |
|
454 |
return None |
|
455 |
||
456 |
||
457 |
cdef StaticTuple _sha1_to_key(char *sha1): |
|
458 |
"""Compute a ('sha1:abcd',) key for a given sha1."""
|
|
459 |
cdef StaticTuple key |
|
460 |
cdef object hexxed |
|
461 |
cdef char *c_buf |
|
462 |
hexxed = PyString_FromStringAndSize(NULL, 45) |
|
463 |
c_buf = PyString_AS_STRING(hexxed) |
|
464 |
memcpy(c_buf, 'sha1:', 5) |
|
465 |
_hexlify_sha1(sha1, c_buf+5) |
|
466 |
key = StaticTuple_New(1) |
|
467 |
Py_INCREF(hexxed) |
|
468 |
StaticTuple_SET_ITEM(key, 0, hexxed) |
|
5365.5.7
by John Arbash Meinel
Document some perf decisions. |
469 |
# This is a bit expensive. To parse 120 keys takes 48us, to return them all
|
470 |
# can be done in 66.6us (so 18.6us to build them all).
|
|
471 |
# Adding simple hash() here brings it to 76.6us (so computing the hash
|
|
472 |
# value of 120keys is 10us), Intern is 86.9us (another 10us to look and add
|
|
473 |
# them to the intern structure.)
|
|
474 |
# However, since we only intern keys that are in active use, it is probably
|
|
475 |
# a win. Since they would have been read from elsewhere anyway.
|
|
476 |
# We *could* hang the PyObject form off of the gc_chk_sha1_record for ones
|
|
477 |
# that we have deserialized. Something to think about, at least.
|
|
5365.5.6
by John Arbash Meinel
For a single page test, we parse() still takes 47us |
478 |
key = StaticTuple_Intern(key) |
5365.5.5
by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers, |
479 |
return key |
480 |
||
481 |
||
5365.5.24
by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :) |
482 |
def _py_sha1_to_key(sha1_bin): |
5365.5.5
by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers, |
483 |
"""Test thunk to check the sha1 mapping."""
|
484 |
if not PyString_CheckExact(sha1_bin) or PyString_GET_SIZE(sha1_bin) != 20: |
|
485 |
raise ValueError('sha1_bin must be a str of exactly 20 bytes') |
|
486 |
return _sha1_to_key(PyString_AS_STRING(sha1_bin)) |
|
487 |
||
488 |
||
5365.5.29
by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow |
489 |
cdef unsigned int _sha1_to_uint(char *sha1): # cannot_raise |
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
490 |
cdef unsigned int val |
491 |
# Must be in MSB, because that is how the content is sorted
|
|
492 |
val = (((<unsigned int>(sha1[0]) & 0xff) << 24) |
|
493 |
| ((<unsigned int>(sha1[1]) & 0xff) << 16) |
|
494 |
| ((<unsigned int>(sha1[2]) & 0xff) << 8) |
|
495 |
| ((<unsigned int>(sha1[3]) & 0xff) << 0)) |
|
496 |
return val |
|
497 |
||
498 |
||
5365.5.28
by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers. |
499 |
cdef _format_record_py24(gc_chk_sha1_record *record): |
500 |
"""Python2.4 PyString_FromFormat doesn't have %u.
|
|
501 |
||
502 |
It only has %d and %ld. We would really like to even have %llu, which
|
|
503 |
is only in python2.7. So we go back into casting to regular objects.
|
|
504 |
"""
|
|
505 |
return "%s %s %s %s" % (record.block_offset, record.block_length, |
|
506 |
record.record_start, record.record_end) |
|
507 |
||
508 |
||
509 |
cdef _format_record(gc_chk_sha1_record *record): |
|
510 |
# This is inefficient to go from a logical state back to a
|
|
511 |
# string, but it makes things work a bit better internally for now.
|
|
512 |
if record.block_offset >= 0xFFFFFFFF: |
|
513 |
# %llu is what we really want, but unfortunately it was only added
|
|
514 |
# in python 2.7... :(
|
|
515 |
block_offset_str = str(record.block_offset) |
|
516 |
value = PyString_FromFormat('%s %lu %lu %lu', |
|
517 |
PyString_AS_STRING(block_offset_str), |
|
518 |
record.block_length, |
|
519 |
record.record_start, record.record_end) |
|
520 |
else: |
|
521 |
value = PyString_FromFormat('%lu %lu %lu %lu', |
|
522 |
<unsigned long>record.block_offset, |
|
523 |
record.block_length, |
|
524 |
record.record_start, record.record_end) |
|
525 |
return value |
|
526 |
||
527 |
ctypedef object (*formatproc)(gc_chk_sha1_record *) |
|
528 |
cdef formatproc _record_formatter |
|
529 |
_record_formatter = _format_record |
|
530 |
if sys.version_info[:2] == (2, 4): |
|
531 |
_record_formatter = _format_record_py24 |
|
532 |
||
533 |
||
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
534 |
cdef class GCCHKSHA1LeafNode: |
535 |
"""Track all the entries for a given leaf node."""
|
|
536 |
||
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
537 |
cdef gc_chk_sha1_record *records |
5365.5.19
by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache. |
538 |
cdef public object last_key |
539 |
cdef gc_chk_sha1_record *last_record |
|
5365.5.22
by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand. |
540 |
cdef public int num_records |
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
541 |
# This is the number of bits to shift to get to the interesting byte. A
|
542 |
# value of 24 means that the very first byte changes across all keys.
|
|
543 |
# Anything else means that there is a common prefix of bits that we can
|
|
544 |
# ignore. 0 means that at least the first 3 bytes are identical, though
|
|
545 |
# that is going to be very rare
|
|
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
546 |
cdef public unsigned char common_shift |
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
547 |
# This maps an interesting byte to the first record that matches.
|
548 |
# Equivalent to bisect.bisect_left(self.records, sha1), though only taking
|
|
549 |
# into account that one byte.
|
|
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
550 |
cdef unsigned char offsets[257] |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
551 |
|
552 |
def __sizeof__(self): |
|
5365.5.22
by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand. |
553 |
# :( Why doesn't Pyrex let me do a simple sizeof(GCCHKSHA1LeafNode)
|
554 |
# like Cython? Explicitly enumerating everything here seems to leave my
|
|
555 |
# size off by 2 (286 bytes vs 288 bytes actual). I'm guessing it is an
|
|
556 |
# alignment/padding issue. Oh well- at least we scale properly with
|
|
557 |
# num_records and are very close to correct, which is what I care
|
|
558 |
# about.
|
|
559 |
# If we ever decide to require cython:
|
|
560 |
# return (sizeof(GCCHKSHA1LeafNode)
|
|
561 |
# + sizeof(gc_chk_sha1_record)*self.num_records)
|
|
562 |
return (sizeof(PyObject) + sizeof(void*) + sizeof(int) |
|
563 |
+ sizeof(gc_chk_sha1_record*) + sizeof(PyObject *) |
|
564 |
+ sizeof(gc_chk_sha1_record*) + sizeof(char) |
|
565 |
+ sizeof(unsigned char)*257 |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
566 |
+ sizeof(gc_chk_sha1_record)*self.num_records) |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
567 |
|
568 |
def __dealloc__(self): |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
569 |
if self.records != NULL: |
570 |
PyMem_Free(self.records) |
|
571 |
self.records = NULL |
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
572 |
|
573 |
def __init__(self, bytes): |
|
574 |
self._parse_bytes(bytes) |
|
5365.5.19
by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache. |
575 |
self.last_key = None |
576 |
self.last_record = NULL |
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
577 |
|
578 |
property min_key: |
|
579 |
def __get__(self): |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
580 |
if self.num_records > 0: |
581 |
return _sha1_to_key(self.records[0].sha1) |
|
5365.5.2
by John Arbash Meinel
Parse times are dramatically improved. |
582 |
return None |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
583 |
|
584 |
property max_key: |
|
585 |
def __get__(self): |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
586 |
if self.num_records > 0: |
587 |
return _sha1_to_key(self.records[self.num_records-1].sha1) |
|
5365.5.2
by John Arbash Meinel
Parse times are dramatically improved. |
588 |
return None |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
589 |
|
590 |
cdef StaticTuple _record_to_value_and_refs(self, |
|
591 |
gc_chk_sha1_record *record): |
|
592 |
"""Extract the refs and value part of this record."""
|
|
593 |
cdef StaticTuple value_and_refs |
|
594 |
cdef StaticTuple empty |
|
595 |
value_and_refs = StaticTuple_New(2) |
|
5365.5.28
by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers. |
596 |
value = _record_formatter(record) |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
597 |
Py_INCREF(value) |
598 |
StaticTuple_SET_ITEM(value_and_refs, 0, value) |
|
599 |
# Always empty refs
|
|
600 |
empty = StaticTuple_New(0) |
|
601 |
Py_INCREF(empty) |
|
602 |
StaticTuple_SET_ITEM(value_and_refs, 1, empty) |
|
603 |
return value_and_refs |
|
604 |
||
605 |
cdef StaticTuple _record_to_item(self, gc_chk_sha1_record *record): |
|
606 |
"""Turn a given record back into a fully fledged item.
|
|
607 |
"""
|
|
608 |
cdef StaticTuple item |
|
609 |
cdef StaticTuple key |
|
610 |
cdef StaticTuple value_and_refs |
|
611 |
cdef object value |
|
5365.5.5
by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers, |
612 |
key = _sha1_to_key(record.sha1) |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
613 |
item = StaticTuple_New(2) |
614 |
Py_INCREF(key) |
|
615 |
StaticTuple_SET_ITEM(item, 0, key) |
|
616 |
value_and_refs = self._record_to_value_and_refs(record) |
|
617 |
Py_INCREF(value_and_refs) |
|
618 |
StaticTuple_SET_ITEM(item, 1, value_and_refs) |
|
619 |
return item |
|
620 |
||
5365.5.19
by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache. |
621 |
cdef gc_chk_sha1_record* _lookup_record(self, char *sha1) except? NULL: |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
622 |
"""Find a gc_chk_sha1_record that matches the sha1 supplied."""
|
5365.5.21
by John Arbash Meinel
Remove counters, start the cleanup |
623 |
cdef int lo, hi, mid, the_cmp |
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
624 |
cdef int offset |
5365.5.9
by John Arbash Meinel
Add some tests that key lookup works. |
625 |
|
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
626 |
# TODO: We can speed up misses by comparing this sha1 to the common
|
627 |
# bits, and seeing if the common prefix matches, if not, we don't
|
|
628 |
# need to search for anything because it cannot match
|
|
629 |
# Use the offset array to find the closest fit for this entry
|
|
630 |
# follow that up with bisecting, since multiple keys can be in one
|
|
631 |
# spot
|
|
632 |
# Bisecting dropped us from 7000 comparisons to 582 (4.8/key), using
|
|
633 |
# the offset array dropped us from 23us to 20us and 156 comparisions
|
|
634 |
# (1.3/key)
|
|
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
635 |
offset = self._offset_for_sha1(sha1) |
636 |
lo = self.offsets[offset] |
|
637 |
hi = self.offsets[offset+1] |
|
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
638 |
if hi == 255: |
639 |
# if hi == 255 that means we potentially ran off the end of the
|
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
640 |
# list, so push it up to num_records
|
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
641 |
# note that if 'lo' == 255, that is ok, because we can start
|
642 |
# searching from that part of the list.
|
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
643 |
hi = self.num_records |
5365.5.19
by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache. |
644 |
local_n_cmp = 0 |
5365.5.9
by John Arbash Meinel
Add some tests that key lookup works. |
645 |
while lo < hi: |
646 |
mid = (lo + hi) / 2 |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
647 |
the_cmp = memcmp(self.records[mid].sha1, sha1, 20) |
5365.5.9
by John Arbash Meinel
Add some tests that key lookup works. |
648 |
if the_cmp == 0: |
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
649 |
return &self.records[mid] |
5365.5.9
by John Arbash Meinel
Add some tests that key lookup works. |
650 |
elif the_cmp < 0: |
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
651 |
lo = mid + 1 |
652 |
else: |
|
5365.5.9
by John Arbash Meinel
Add some tests that key lookup works. |
653 |
hi = mid |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
654 |
return NULL |
655 |
||
656 |
def __contains__(self, key): |
|
657 |
cdef char sha1[20] |
|
658 |
cdef gc_chk_sha1_record *record |
|
5365.5.19
by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache. |
659 |
if _key_to_sha1(key, sha1): |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
660 |
# If it isn't a sha1 key, then it won't be in this leaf node
|
5365.5.19
by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache. |
661 |
record = self._lookup_record(sha1) |
662 |
if record != NULL: |
|
663 |
self.last_key = key |
|
664 |
self.last_record = record |
|
665 |
return True |
|
666 |
return False |
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
667 |
|
668 |
def __getitem__(self, key): |
|
669 |
cdef char sha1[20] |
|
5365.5.22
by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand. |
670 |
cdef gc_chk_sha1_record *record |
671 |
record = NULL |
|
5365.5.19
by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache. |
672 |
if self.last_record != NULL and key is self.last_key: |
673 |
record = self.last_record |
|
674 |
elif _key_to_sha1(key, sha1): |
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
675 |
record = self._lookup_record(sha1) |
676 |
if record == NULL: |
|
677 |
raise KeyError('key %r is not present' % (key,)) |
|
678 |
return self._record_to_value_and_refs(record) |
|
679 |
||
680 |
def __len__(self): |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
681 |
return self.num_records |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
682 |
|
683 |
def all_keys(self): |
|
684 |
cdef int i |
|
5365.5.22
by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand. |
685 |
result = [] |
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
686 |
for i from 0 <= i < self.num_records: |
5365.5.22
by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand. |
687 |
PyList_Append(result, _sha1_to_key(self.records[i].sha1)) |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
688 |
return result |
689 |
||
690 |
def all_items(self): |
|
691 |
cdef int i |
|
5365.5.22
by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand. |
692 |
result = [] |
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
693 |
for i from 0 <= i < self.num_records: |
694 |
item = self._record_to_item(&self.records[i]) |
|
5365.5.22
by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand. |
695 |
PyList_Append(result, item) |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
696 |
return result |
697 |
||
5365.5.29
by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow |
698 |
cdef int _count_records(self, char *c_content, char *c_end): # cannot_raise |
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
699 |
"""Count how many records are in this section."""
|
700 |
cdef char *c_cur |
|
701 |
cdef int num_records |
|
702 |
||
703 |
c_cur = c_content |
|
704 |
num_records = 0 |
|
705 |
while c_cur != NULL and c_cur < c_end: |
|
706 |
c_cur = <char *>memchr(c_cur, c'\n', c_end - c_cur); |
|
707 |
if c_cur == NULL: |
|
708 |
break
|
|
709 |
c_cur = c_cur + 1 |
|
710 |
num_records = num_records + 1 |
|
711 |
return num_records |
|
712 |
||
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
713 |
cdef _parse_bytes(self, bytes): |
714 |
"""Parse the string 'bytes' into content."""
|
|
715 |
cdef char *c_bytes |
|
716 |
cdef char *c_cur |
|
717 |
cdef char *c_end |
|
718 |
cdef Py_ssize_t n_bytes |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
719 |
cdef int num_records |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
720 |
cdef int entry |
721 |
cdef gc_chk_sha1_record *cur_record |
|
722 |
||
723 |
if not PyString_CheckExact(bytes): |
|
724 |
raise TypeError('We only support parsing plain 8-bit strings.') |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
725 |
# Pass 1, count how many records there will be
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
726 |
n_bytes = PyString_GET_SIZE(bytes) |
727 |
c_bytes = PyString_AS_STRING(bytes) |
|
728 |
c_end = c_bytes + n_bytes |
|
729 |
if strncmp(c_bytes, 'type=leaf\n', 10): |
|
730 |
raise ValueError("bytes did not start with 'type=leaf\\n': %r" |
|
731 |
% (bytes[:10],)) |
|
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
732 |
c_cur = c_bytes + 10 |
733 |
num_records = self._count_records(c_cur, c_end) |
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
734 |
# Now allocate the memory for these items, and go to town
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
735 |
self.records = <gc_chk_sha1_record*>PyMem_Malloc(num_records * |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
736 |
(sizeof(unsigned short) + sizeof(gc_chk_sha1_record))) |
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
737 |
self.num_records = num_records |
738 |
cur_record = self.records |
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
739 |
entry = 0 |
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
740 |
while c_cur != NULL and c_cur < c_end and entry < num_records: |
5365.5.16
by John Arbash Meinel
Trim the class a little bit. |
741 |
c_cur = self._parse_one_entry(c_cur, c_end, cur_record) |
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
742 |
cur_record = cur_record + 1 |
743 |
entry = entry + 1 |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
744 |
if (entry != self.num_records |
5365.5.5
by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers, |
745 |
or c_cur != c_end |
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
746 |
or cur_record != self.records + self.num_records): |
5365.5.5
by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers, |
747 |
raise ValueError('Something went wrong while parsing.') |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
748 |
# Pass 3: build the offset map
|
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
749 |
self._compute_common() |
750 |
||
5365.5.16
by John Arbash Meinel
Trim the class a little bit. |
751 |
cdef char *_parse_one_entry(self, char *c_cur, char *c_end, |
752 |
gc_chk_sha1_record *cur_record) except NULL: |
|
753 |
"""Read a single sha record from the bytes.
|
|
754 |
:param c_cur: The pointer to the start of bytes
|
|
755 |
:param cur_record:
|
|
756 |
"""
|
|
757 |
cdef char *c_next |
|
758 |
if strncmp(c_cur, 'sha1:', 5): |
|
759 |
raise ValueError('line did not start with sha1: %r' |
|
760 |
% (safe_string_from_size(c_cur, 10),)) |
|
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
761 |
c_cur = c_cur + 5 |
5365.5.16
by John Arbash Meinel
Trim the class a little bit. |
762 |
c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur) |
763 |
if c_next == NULL or (c_next - c_cur != 40): |
|
764 |
raise ValueError('Line did not contain 40 hex bytes') |
|
765 |
if not _unhexlify_sha1(c_cur, cur_record.sha1): |
|
766 |
raise ValueError('We failed to unhexlify') |
|
767 |
c_cur = c_next + 1 |
|
768 |
if c_cur[0] != c'\0': |
|
769 |
raise ValueError('only 1 null, not 2 as expected') |
|
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
770 |
c_cur = c_cur + 1 |
5365.5.16
by John Arbash Meinel
Trim the class a little bit. |
771 |
cur_record.block_offset = strtoll(c_cur, &c_next, 10) |
772 |
if c_cur == c_next or c_next[0] != c' ': |
|
773 |
raise ValueError('Failed to parse block offset') |
|
774 |
c_cur = c_next + 1 |
|
775 |
cur_record.block_length = strtoul(c_cur, &c_next, 10) |
|
776 |
if c_cur == c_next or c_next[0] != c' ': |
|
777 |
raise ValueError('Failed to parse block length') |
|
778 |
c_cur = c_next + 1 |
|
779 |
cur_record.record_start = strtoul(c_cur, &c_next, 10) |
|
780 |
if c_cur == c_next or c_next[0] != c' ': |
|
781 |
raise ValueError('Failed to parse block length') |
|
782 |
c_cur = c_next + 1 |
|
783 |
cur_record.record_end = strtoul(c_cur, &c_next, 10) |
|
784 |
if c_cur == c_next or c_next[0] != c'\n': |
|
785 |
raise ValueError('Failed to parse record end') |
|
786 |
c_cur = c_next + 1 |
|
787 |
return c_cur |
|
788 |
||
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
789 |
cdef int _offset_for_sha1(self, char *sha1) except -1: |
790 |
"""Find the first interesting 8-bits of this sha1."""
|
|
791 |
cdef int this_offset |
|
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
792 |
cdef unsigned int as_uint |
793 |
as_uint = _sha1_to_uint(sha1) |
|
794 |
this_offset = (as_uint >> self.common_shift) & 0xFF |
|
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
795 |
return this_offset |
796 |
||
5365.5.24
by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :) |
797 |
def _get_offset_for_sha1(self, sha1): |
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
798 |
return self._offset_for_sha1(PyString_AS_STRING(sha1)) |
799 |
||
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
800 |
cdef _compute_common(self): |
801 |
cdef unsigned int first |
|
802 |
cdef unsigned int this |
|
803 |
cdef unsigned int common_mask |
|
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
804 |
cdef unsigned char common_shift |
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
805 |
cdef int i |
806 |
cdef int offset, this_offset |
|
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
807 |
cdef int max_offset |
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
808 |
# The idea with the offset map is that we should be able to quickly
|
809 |
# jump to the key that matches a gives sha1. We know that the keys are
|
|
810 |
# in sorted order, and we know that a lot of the prefix is going to be
|
|
811 |
# the same across them.
|
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
812 |
# By XORing the records together, we can determine what bits are set in
|
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
813 |
# all of them
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
814 |
if self.num_records < 2: |
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
815 |
# Everything is in common if you have 0 or 1 leaves
|
816 |
# So we'll always just shift to the first byte
|
|
817 |
self.common_shift = 24 |
|
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
818 |
else: |
819 |
common_mask = 0xFFFFFFFF |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
820 |
first = _sha1_to_uint(self.records[0].sha1) |
821 |
for i from 0 < i < self.num_records: |
|
822 |
this = _sha1_to_uint(self.records[i].sha1) |
|
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
823 |
common_mask = (~(first ^ this)) & common_mask |
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
824 |
common_shift = 24 |
825 |
while common_mask & 0x80000000 and common_shift > 0: |
|
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
826 |
common_mask = common_mask << 1 |
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
827 |
common_shift = common_shift - 1 |
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
828 |
self.common_shift = common_shift |
829 |
offset = 0 |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
830 |
max_offset = self.num_records |
831 |
# We cap this loop at 254 records. All the other offsets just get
|
|
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
832 |
# filled with 0xff as the singleton saying 'too many'.
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
833 |
# It means that if we have >255 records we have to bisect the second
|
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
834 |
# half of the list, but this is going to be very rare in practice.
|
835 |
if max_offset > 255: |
|
836 |
max_offset = 255 |
|
837 |
for i from 0 <= i < max_offset: |
|
5365.5.17
by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups. |
838 |
this_offset = self._offset_for_sha1(self.records[i].sha1) |
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
839 |
while offset <= this_offset: |
840 |
self.offsets[offset] = i |
|
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
841 |
offset = offset + 1 |
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
842 |
while offset < 257: |
5365.5.15
by John Arbash Meinel
Handle the case where there are more than 255 entries. |
843 |
self.offsets[offset] = max_offset |
5365.5.27
by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex. |
844 |
offset = offset + 1 |
5365.5.13
by John Arbash Meinel
Populate the offsets array. |
845 |
|
5365.5.24
by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :) |
846 |
def _get_offsets(self): |
847 |
cdef int i |
|
848 |
result = [] |
|
849 |
for i from 0 <= i < 257: |
|
850 |
PyList_Append(result, self.offsets[i]) |
|
851 |
return result |
|
5365.5.1
by John Arbash Meinel
Implement a custom parser for chk btree leaves. |
852 |
|
853 |
||
854 |
def _parse_into_chk(bytes, key_length, ref_list_length): |
|
855 |
"""Parse into a format optimized for chk records."""
|
|
856 |
assert key_length == 1 |
|
857 |
assert ref_list_length == 0 |
|
858 |
return GCCHKSHA1LeafNode(bytes) |
|
859 |
||
860 |
||
3641.3.18
by John Arbash Meinel
Start working on a compiled function for transforming |
861 |
def _flatten_node(node, reference_lists): |
862 |
"""Convert a node into the serialized form.
|
|
863 |
||
864 |
:param node: A tuple representing a node:
|
|
865 |
(index, key_tuple, value, references)
|
|
866 |
:param reference_lists: Does this index have reference lists?
|
|
867 |
:return: (string_key, flattened)
|
|
868 |
string_key The serialized key for referencing this node
|
|
869 |
flattened A string with the serialized form for the contents
|
|
870 |
"""
|
|
3641.3.26
by John Arbash Meinel
A couple small tweaks. |
871 |
cdef int have_reference_lists |
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
872 |
cdef Py_ssize_t flat_len |
873 |
cdef Py_ssize_t key_len |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
874 |
cdef Py_ssize_t node_len |
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
875 |
cdef char * value |
876 |
cdef Py_ssize_t value_len |
|
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
877 |
cdef char * out |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
878 |
cdef Py_ssize_t refs_len |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
879 |
cdef Py_ssize_t next_len |
3641.3.21
by John Arbash Meinel
Flatten the outermost str.join() into memcpy's |
880 |
cdef int first_ref_list |
3641.3.22
by John Arbash Meinel
flatten the next level |
881 |
cdef int first_reference |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
882 |
cdef int i |
883 |
cdef Py_ssize_t ref_bit_len |
|
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
884 |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
885 |
if not PyTuple_CheckExact(node) and not StaticTuple_CheckExact(node): |
886 |
raise TypeError('We expected a tuple() or StaticTuple() for node not: %s' |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
887 |
% type(node)) |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
888 |
node_len = len(node) |
3641.3.26
by John Arbash Meinel
A couple small tweaks. |
889 |
have_reference_lists = reference_lists |
890 |
if have_reference_lists: |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
891 |
if node_len != 4: |
892 |
raise ValueError('With ref_lists, we expected 4 entries not: %s' |
|
893 |
% len(node)) |
|
894 |
elif node_len < 3: |
|
895 |
raise ValueError('Without ref_lists, we need at least 3 entries not: %s' |
|
896 |
% len(node)) |
|
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
897 |
# TODO: We can probably do better than string.join(), namely
|
898 |
# when key has only 1 item, we can just grab that string
|
|
899 |
# And when there are 2 items, we could do a single malloc + len() + 1
|
|
900 |
# also, doing .join() requires a PyObject_GetAttrString call, which
|
|
901 |
# we could also avoid.
|
|
4679.8.4
by John Arbash Meinel
Comment about compiled code quality. |
902 |
# TODO: Note that pyrex 0.9.6 generates fairly crummy code here, using the
|
903 |
# python object interface, versus 0.9.8+ which uses a helper that
|
|
904 |
# checks if this supports the sequence interface.
|
|
905 |
# We *could* do more work on our own, and grab the actual items
|
|
906 |
# lists. For now, just ask people to use a better compiler. :)
|
|
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
907 |
string_key = '\0'.join(node[1]) |
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
908 |
|
3641.3.18
by John Arbash Meinel
Start working on a compiled function for transforming |
909 |
# TODO: instead of using string joins, precompute the final string length,
|
910 |
# and then malloc a single string and copy everything in.
|
|
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
911 |
|
912 |
# TODO: We probably want to use PySequenceFast, because we have lists and
|
|
913 |
# tuples, but we aren't sure which we will get.
|
|
914 |
||
915 |
# line := string_key NULL flat_refs NULL value LF
|
|
916 |
# string_key := BYTES (NULL BYTES)*
|
|
917 |
# flat_refs := ref_list (TAB ref_list)*
|
|
918 |
# ref_list := ref (CR ref)*
|
|
919 |
# ref := BYTES (NULL BYTES)*
|
|
920 |
# value := BYTES
|
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
921 |
refs_len = 0 |
3641.3.26
by John Arbash Meinel
A couple small tweaks. |
922 |
if have_reference_lists: |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
923 |
# Figure out how many bytes it will take to store the references
|
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
924 |
ref_lists = node[3] |
3641.3.24
by John Arbash Meinel
Use the compiled flatten function. |
925 |
next_len = len(ref_lists) # TODO: use a Py function |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
926 |
if next_len > 0: |
927 |
# If there are no nodes, we don't need to do any work
|
|
928 |
# Otherwise we will need (len - 1) '\t' characters to separate
|
|
929 |
# the reference lists
|
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
930 |
refs_len = refs_len + (next_len - 1) |
3641.3.24
by John Arbash Meinel
Use the compiled flatten function. |
931 |
for ref_list in ref_lists: |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
932 |
next_len = len(ref_list) |
933 |
if next_len > 0: |
|
934 |
# We will need (len - 1) '\r' characters to separate the
|
|
935 |
# references
|
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
936 |
refs_len = refs_len + (next_len - 1) |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
937 |
for reference in ref_list: |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
938 |
if (not PyTuple_CheckExact(reference) |
939 |
and not StaticTuple_CheckExact(reference)): |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
940 |
raise TypeError( |
941 |
'We expect references to be tuples not: %s' |
|
942 |
% type(reference)) |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
943 |
next_len = len(reference) |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
944 |
if next_len > 0: |
945 |
# We will need (len - 1) '\x00' characters to
|
|
946 |
# separate the reference key
|
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
947 |
refs_len = refs_len + (next_len - 1) |
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
948 |
for ref_bit in reference: |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
949 |
if not PyString_CheckExact(ref_bit): |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
950 |
raise TypeError('We expect reference bits' |
951 |
' to be strings not: %s' |
|
952 |
% type(<object>ref_bit)) |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
953 |
refs_len = refs_len + PyString_GET_SIZE(ref_bit) |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
954 |
|
955 |
# So we have the (key NULL refs NULL value LF)
|
|
956 |
key_len = PyString_Size(string_key) |
|
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
957 |
val = node[2] |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
958 |
if not PyString_CheckExact(val): |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
959 |
raise TypeError('Expected a plain str for value not: %s' |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
960 |
% type(val)) |
961 |
value = PyString_AS_STRING(val) |
|
962 |
value_len = PyString_GET_SIZE(val) |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
963 |
flat_len = (key_len + 1 + refs_len + 1 + value_len + 1) |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
964 |
line = PyString_FromStringAndSize(NULL, flat_len) |
965 |
# Get a pointer to the new buffer
|
|
966 |
out = PyString_AsString(line) |
|
967 |
memcpy(out, PyString_AsString(string_key), key_len) |
|
968 |
out = out + key_len |
|
969 |
out[0] = c'\0' |
|
970 |
out = out + 1 |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
971 |
if refs_len > 0: |
3641.3.21
by John Arbash Meinel
Flatten the outermost str.join() into memcpy's |
972 |
first_ref_list = 1 |
3641.3.24
by John Arbash Meinel
Use the compiled flatten function. |
973 |
for ref_list in ref_lists: |
3641.3.21
by John Arbash Meinel
Flatten the outermost str.join() into memcpy's |
974 |
if first_ref_list == 0: |
975 |
out[0] = c'\t' |
|
976 |
out = out + 1 |
|
977 |
first_ref_list = 0 |
|
3641.3.22
by John Arbash Meinel
flatten the next level |
978 |
first_reference = 1 |
3641.3.21
by John Arbash Meinel
Flatten the outermost str.join() into memcpy's |
979 |
for reference in ref_list: |
3641.3.22
by John Arbash Meinel
flatten the next level |
980 |
if first_reference == 0: |
981 |
out[0] = c'\r' |
|
982 |
out = out + 1 |
|
983 |
first_reference = 0 |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
984 |
next_len = len(reference) |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
985 |
for i from 0 <= i < next_len: |
986 |
if i != 0: |
|
3641.3.23
by John Arbash Meinel
Flattened all the way down the stack. |
987 |
out[0] = c'\x00' |
988 |
out = out + 1 |
|
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
989 |
ref_bit = reference[i] |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
990 |
ref_bit_len = PyString_GET_SIZE(ref_bit) |
991 |
memcpy(out, PyString_AS_STRING(ref_bit), ref_bit_len) |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
992 |
out = out + ref_bit_len |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
993 |
out[0] = c'\0' |
3641.3.22
by John Arbash Meinel
flatten the next level |
994 |
out = out + 1 |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
995 |
memcpy(out, value, value_len) |
996 |
out = out + value_len |
|
997 |
out[0] = c'\n' |
|
3641.3.18
by John Arbash Meinel
Start working on a compiled function for transforming |
998 |
return string_key, line |