4763.2.4
by John Arbash Meinel
merge bzr.2.1 in preparation for NEWS entry. |
1 |
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
2 |
#
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
7 |
#
|
|
8 |
# This program is distributed in the hope that it will be useful,
|
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
12 |
#
|
|
13 |
# You should have received a copy of the GNU General Public License
|
|
14 |
# along with this program; if not, write to the Free Software
|
|
4183.7.1
by Sabin Iacob
update FSF mailing address |
15 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
3641.3.29
by John Arbash Meinel
Cleanup the copyright headers |
16 |
#
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
17 |
|
18 |
"""Pyrex extensions to btree node parsing."""
|
|
19 |
||
3731.1.1
by Robert Collins
* The C extensions now build on python 2.4 (Robert Collins, #271939) |
20 |
#python2.4 support
|
21 |
cdef extern from "python-compat.h": |
|
22 |
pass
|
|
23 |
||
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
24 |
cdef extern from "stdlib.h": |
25 |
ctypedef unsigned size_t |
|
26 |
||
27 |
cdef extern from "Python.h": |
|
3641.3.32
by John Arbash Meinel
PQM's pyrex version requires Py_ssize_t to be manually defined |
28 |
ctypedef int Py_ssize_t # Required for older pyrex versions |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
29 |
ctypedef struct PyObject: |
30 |
pass
|
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
31 |
int PyList_Append(object lst, object item) except -1 |
32 |
||
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
33 |
char *PyString_AsString(object p) except NULL |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
34 |
object PyString_FromStringAndSize(char *, Py_ssize_t) |
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
35 |
PyObject *PyString_FromStringAndSize_ptr "PyString_FromStringAndSize" (char *, Py_ssize_t) |
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
36 |
int PyString_CheckExact(object s) |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
37 |
int PyString_CheckExact_ptr "PyString_CheckExact" (PyObject *) |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
38 |
Py_ssize_t PyString_Size(object p) |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
39 |
Py_ssize_t PyString_GET_SIZE_ptr "PyString_GET_SIZE" (PyObject *) |
40 |
char * PyString_AS_STRING_ptr "PyString_AS_STRING" (PyObject *) |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
41 |
char * PyString_AS_STRING(object) |
42 |
Py_ssize_t PyString_GET_SIZE(object) |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
43 |
int PyString_AsStringAndSize_ptr(PyObject *, char **buf, Py_ssize_t *len) |
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
44 |
void PyString_InternInPlace(PyObject **) |
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
45 |
int PyTuple_CheckExact(object t) |
4672.2.1
by John Arbash Meinel
Speed up the innermost btree parsing function. |
46 |
object PyTuple_New(Py_ssize_t n_entries) |
47 |
void PyTuple_SET_ITEM(object, Py_ssize_t offset, object) # steals the ref |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
48 |
Py_ssize_t PyTuple_GET_SIZE(object t) |
49 |
PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index) |
|
4672.2.1
by John Arbash Meinel
Speed up the innermost btree parsing function. |
50 |
void Py_INCREF(object) |
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
51 |
void Py_DECREF_ptr "Py_DECREF" (PyObject *) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
52 |
|
53 |
cdef extern from "string.h": |
|
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
54 |
void *memcpy(void *dest, void *src, size_t n) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
55 |
void *memchr(void *s, int c, size_t n) |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
56 |
# GNU extension
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
57 |
# void *memrchr(void *s, int c, size_t n)
|
58 |
int strncmp(char *s1, char *s2, size_t n) |
|
59 |
||
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
60 |
# It seems we need to import the definitions so that the pyrex compiler has
|
61 |
# local names to access them.
|
|
62 |
from _static_tuple_c cimport StaticTuple, \ |
|
63 |
import_static_tuple_c, StaticTuple_New, \ |
|
64 |
StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact |
|
65 |
||
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
66 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
67 |
# TODO: Find some way to import this from _dirstate_helpers
|
4634.117.10
by John Arbash Meinel
Change 'no except' to 'cannot_raise' |
68 |
cdef void* _my_memrchr(void *s, int c, size_t n): # cannot_raise |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
69 |
# memrchr seems to be a GNU extension, so we have to implement it ourselves
|
70 |
# It is not present in any win32 standard library
|
|
71 |
cdef char *pos |
|
72 |
cdef char *start |
|
73 |
||
74 |
start = <char*>s |
|
75 |
pos = start + n - 1 |
|
76 |
while pos >= start: |
|
77 |
if pos[0] == c: |
|
78 |
return <void*>pos |
|
79 |
pos = pos - 1 |
|
80 |
return NULL |
|
81 |
||
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
82 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
83 |
# TODO: Import this from _dirstate_helpers when it is merged
|
84 |
cdef object safe_string_from_size(char *s, Py_ssize_t size): |
|
85 |
if size < 0: |
|
86 |
raise AssertionError( |
|
87 |
'tried to create a string with an invalid size: %d @0x%x' |
|
88 |
% (size, <int>s)) |
|
89 |
return PyString_FromStringAndSize(s, size) |
|
90 |
||
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
91 |
|
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
92 |
cdef object safe_interned_string_from_size(char *s, Py_ssize_t size): |
93 |
cdef PyObject *py_str |
|
94 |
if size < 0: |
|
95 |
raise AssertionError( |
|
96 |
'tried to create a string with an invalid size: %d @0x%x' |
|
97 |
% (size, <int>s)) |
|
98 |
py_str = PyString_FromStringAndSize_ptr(s, size) |
|
99 |
PyString_InternInPlace(&py_str) |
|
100 |
result = <object>py_str |
|
101 |
# Casting a PyObject* to an <object> triggers an INCREF from Pyrex, so we
|
|
102 |
# DECREF it to avoid geting immortal strings
|
|
103 |
Py_DECREF_ptr(py_str) |
|
104 |
return result |
|
105 |
||
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
106 |
from bzrlib import _static_tuple_c |
107 |
# This sets up the StaticTuple C_API functionality
|
|
108 |
import_static_tuple_c() |
|
109 |
||
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
110 |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
111 |
cdef class BTreeLeafParser: |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
112 |
"""Parse the leaf nodes of a BTree index.
|
113 |
||
114 |
:ivar bytes: The PyString object containing the uncompressed text for the
|
|
115 |
node.
|
|
116 |
:ivar key_length: An integer describing how many pieces the keys have for
|
|
117 |
this index.
|
|
118 |
:ivar ref_list_length: An integer describing how many references this index
|
|
119 |
contains.
|
|
120 |
:ivar keys: A PyList of keys found in this node.
|
|
121 |
||
122 |
:ivar _cur_str: A pointer to the start of the next line to parse
|
|
123 |
:ivar _end_str: A pointer to the end of bytes
|
|
124 |
:ivar _start: Pointer to the location within the current line while
|
|
125 |
parsing.
|
|
126 |
:ivar _header_found: True when we have parsed the header for this node
|
|
127 |
"""
|
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
128 |
|
129 |
cdef object bytes |
|
130 |
cdef int key_length |
|
131 |
cdef int ref_list_length |
|
132 |
cdef object keys |
|
133 |
||
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
134 |
cdef char * _cur_str |
135 |
cdef char * _end_str |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
136 |
# The current start point for parsing
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
137 |
cdef char * _start |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
138 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
139 |
cdef int _header_found |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
140 |
|
141 |
def __init__(self, bytes, key_length, ref_list_length): |
|
142 |
self.bytes = bytes |
|
143 |
self.key_length = key_length |
|
144 |
self.ref_list_length = ref_list_length |
|
145 |
self.keys = [] |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
146 |
self._cur_str = NULL |
147 |
self._end_str = NULL |
|
148 |
self._header_found = 0 |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
149 |
# keys are tuples
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
150 |
|
151 |
cdef extract_key(self, char * last): |
|
152 |
"""Extract a key.
|
|
153 |
||
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
154 |
:param last: points at the byte after the last byte permitted for the
|
155 |
key.
|
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
156 |
"""
|
157 |
cdef char *temp_ptr |
|
158 |
cdef int loop_counter |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
159 |
cdef StaticTuple key |
160 |
||
161 |
key = StaticTuple_New(self.key_length) |
|
4672.2.2
by John Arbash Meinel
Switch to using a for loop, and add only when checking last rather than |
162 |
for loop_counter from 0 <= loop_counter < self.key_length: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
163 |
# grab a key segment
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
164 |
temp_ptr = <char*>memchr(self._start, c'\0', last - self._start) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
165 |
if temp_ptr == NULL: |
4672.2.2
by John Arbash Meinel
Switch to using a for loop, and add only when checking last rather than |
166 |
if loop_counter + 1 == self.key_length: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
167 |
# capture to last
|
168 |
temp_ptr = last |
|
169 |
else: |
|
170 |
# Invalid line
|
|
171 |
failure_string = ("invalid key, wanted segment from " + |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
172 |
repr(safe_string_from_size(self._start, |
173 |
last - self._start))) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
174 |
raise AssertionError(failure_string) |
175 |
# capture the key string
|
|
4679.8.1
by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks. |
176 |
if (self.key_length == 1 |
177 |
and (temp_ptr - self._start) == 45 |
|
178 |
and strncmp(self._start, 'sha1:', 5) == 0): |
|
179 |
key_element = safe_string_from_size(self._start, |
|
180 |
temp_ptr - self._start) |
|
181 |
else: |
|
182 |
key_element = safe_interned_string_from_size(self._start, |
|
4075.3.1
by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed. |
183 |
temp_ptr - self._start) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
184 |
# advance our pointer
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
185 |
self._start = temp_ptr + 1 |
4672.2.1
by John Arbash Meinel
Speed up the innermost btree parsing function. |
186 |
Py_INCREF(key_element) |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
187 |
StaticTuple_SET_ITEM(key, loop_counter, key_element) |
188 |
key = StaticTuple_Intern(key) |
|
4679.3.48
by John Arbash Meinel
Prototype of not interning sha1: strings. |
189 |
return key |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
190 |
|
191 |
cdef int process_line(self) except -1: |
|
192 |
"""Process a line in the bytes."""
|
|
193 |
cdef char *last |
|
194 |
cdef char *temp_ptr |
|
195 |
cdef char *ref_ptr |
|
196 |
cdef char *next_start |
|
197 |
cdef int loop_counter |
|
4679.8.1
by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks. |
198 |
cdef Py_ssize_t str_len |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
199 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
200 |
self._start = self._cur_str |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
201 |
# Find the next newline
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
202 |
last = <char*>memchr(self._start, c'\n', self._end_str - self._start) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
203 |
if last == NULL: |
204 |
# Process until the end of the file
|
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
205 |
last = self._end_str |
206 |
self._cur_str = self._end_str |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
207 |
else: |
208 |
# And the next string is right after it
|
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
209 |
self._cur_str = last + 1 |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
210 |
# The last character is right before the '\n'
|
211 |
||
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
212 |
if last == self._start: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
213 |
# parsed it all.
|
214 |
return 0 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
215 |
if last < self._start: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
216 |
# Unexpected error condition - fail
|
4634.60.1
by John Arbash Meinel
Don't return -1 directly, instead raise an exception. |
217 |
raise AssertionError("last < self._start") |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
218 |
if 0 == self._header_found: |
219 |
# The first line in a leaf node is the header "type=leaf\n"
|
|
220 |
if strncmp("type=leaf", self._start, last - self._start) == 0: |
|
221 |
self._header_found = 1 |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
222 |
return 0 |
223 |
else: |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
224 |
raise AssertionError('Node did not start with "type=leaf": %r' |
225 |
% (safe_string_from_size(self._start, last - self._start))) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
226 |
|
227 |
key = self.extract_key(last) |
|
228 |
# find the value area
|
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
229 |
temp_ptr = <char*>_my_memrchr(self._start, c'\0', last - self._start) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
230 |
if temp_ptr == NULL: |
231 |
# Invalid line
|
|
4634.60.1
by John Arbash Meinel
Don't return -1 directly, instead raise an exception. |
232 |
raise AssertionError("Failed to find the value area") |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
233 |
else: |
4679.8.1
by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks. |
234 |
# Because of how conversions were done, we ended up with *lots* of
|
235 |
# values that are identical. These are all of the 0-length nodes
|
|
236 |
# that are referred to by the TREE_ROOT (and likely some other
|
|
237 |
# directory nodes.) For example, bzr has 25k references to
|
|
238 |
# something like '12607215 328306 0 0', which ends up consuming 1MB
|
|
239 |
# of memory, just for those strings.
|
|
240 |
str_len = last - temp_ptr - 1 |
|
241 |
if (str_len > 4 |
|
242 |
and strncmp(" 0 0", last - 4, 4) == 0): |
|
243 |
# This drops peak mem for bzr.dev from 87.4MB => 86.2MB
|
|
244 |
# For Launchpad 236MB => 232MB
|
|
245 |
value = safe_interned_string_from_size(temp_ptr + 1, str_len) |
|
246 |
else: |
|
247 |
value = safe_string_from_size(temp_ptr + 1, str_len) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
248 |
# shrink the references end point
|
249 |
last = temp_ptr |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
250 |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
251 |
if self.ref_list_length: |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
252 |
ref_lists = StaticTuple_New(self.ref_list_length) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
253 |
loop_counter = 0 |
254 |
while loop_counter < self.ref_list_length: |
|
255 |
ref_list = [] |
|
256 |
# extract a reference list
|
|
257 |
loop_counter = loop_counter + 1 |
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
258 |
if last < self._start: |
4634.60.1
by John Arbash Meinel
Don't return -1 directly, instead raise an exception. |
259 |
raise AssertionError("last < self._start") |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
260 |
# find the next reference list end point:
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
261 |
temp_ptr = <char*>memchr(self._start, c'\t', last - self._start) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
262 |
if temp_ptr == NULL: |
263 |
# Only valid for the last list
|
|
264 |
if loop_counter != self.ref_list_length: |
|
265 |
# Invalid line
|
|
4634.60.1
by John Arbash Meinel
Don't return -1 directly, instead raise an exception. |
266 |
raise AssertionError( |
267 |
"invalid key, loop_counter != self.ref_list_length") |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
268 |
else: |
269 |
# scan to the end of the ref list area
|
|
270 |
ref_ptr = last |
|
271 |
next_start = last |
|
272 |
else: |
|
273 |
# scan to the end of this ref list
|
|
274 |
ref_ptr = temp_ptr |
|
275 |
next_start = temp_ptr + 1 |
|
276 |
# Now, there may be multiple keys in the ref list.
|
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
277 |
while self._start < ref_ptr: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
278 |
# loop finding keys and extracting them
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
279 |
temp_ptr = <char*>memchr(self._start, c'\r', |
280 |
ref_ptr - self._start) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
281 |
if temp_ptr == NULL: |
282 |
# key runs to the end
|
|
283 |
temp_ptr = ref_ptr |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
284 |
|
4679.3.28
by John Arbash Meinel
Add a KEY_HAS_HASH define. |
285 |
PyList_Append(ref_list, self.extract_key(temp_ptr)) |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
286 |
ref_list = StaticTuple_Intern(StaticTuple(*ref_list)) |
287 |
Py_INCREF(ref_list) |
|
288 |
StaticTuple_SET_ITEM(ref_lists, loop_counter - 1, ref_list) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
289 |
# prepare for the next reference list
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
290 |
self._start = next_start |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
291 |
node_value = StaticTuple(value, ref_lists) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
292 |
else: |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
293 |
if last != self._start: |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
294 |
# unexpected reference data present
|
4634.60.1
by John Arbash Meinel
Don't return -1 directly, instead raise an exception. |
295 |
raise AssertionError("unexpected reference data present") |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
296 |
node_value = StaticTuple(value, StaticTuple()) |
297 |
PyList_Append(self.keys, StaticTuple(key, node_value)) |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
298 |
return 0 |
299 |
||
300 |
def parse(self): |
|
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
301 |
cdef Py_ssize_t byte_count |
302 |
if not PyString_CheckExact(self.bytes): |
|
303 |
raise AssertionError('self.bytes is not a string.') |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
304 |
byte_count = PyString_Size(self.bytes) |
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
305 |
self._cur_str = PyString_AsString(self.bytes) |
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
306 |
# This points to the last character in the string
|
3641.3.2
by John Arbash Meinel
Clean up some variable names, add some documentation. |
307 |
self._end_str = self._cur_str + byte_count |
308 |
while self._cur_str < self._end_str: |
|
3641.3.1
by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests. |
309 |
self.process_line() |
310 |
return self.keys |
|
311 |
||
312 |
||
313 |
def _parse_leaf_lines(bytes, key_length, ref_list_length): |
|
314 |
parser = BTreeLeafParser(bytes, key_length, ref_list_length) |
|
315 |
return parser.parse() |
|
3641.3.18
by John Arbash Meinel
Start working on a compiled function for transforming |
316 |
|
317 |
||
318 |
def _flatten_node(node, reference_lists): |
|
319 |
"""Convert a node into the serialized form.
|
|
320 |
||
321 |
:param node: A tuple representing a node:
|
|
322 |
(index, key_tuple, value, references)
|
|
323 |
:param reference_lists: Does this index have reference lists?
|
|
324 |
:return: (string_key, flattened)
|
|
325 |
string_key The serialized key for referencing this node
|
|
326 |
flattened A string with the serialized form for the contents
|
|
327 |
"""
|
|
3641.3.26
by John Arbash Meinel
A couple small tweaks. |
328 |
cdef int have_reference_lists |
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
329 |
cdef Py_ssize_t flat_len |
330 |
cdef Py_ssize_t key_len |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
331 |
cdef Py_ssize_t node_len |
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
332 |
cdef char * value |
333 |
cdef Py_ssize_t value_len |
|
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
334 |
cdef char * out |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
335 |
cdef Py_ssize_t refs_len |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
336 |
cdef Py_ssize_t next_len |
3641.3.21
by John Arbash Meinel
Flatten the outermost str.join() into memcpy's |
337 |
cdef int first_ref_list |
3641.3.22
by John Arbash Meinel
flatten the next level |
338 |
cdef int first_reference |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
339 |
cdef int i |
340 |
cdef Py_ssize_t ref_bit_len |
|
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
341 |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
342 |
if not PyTuple_CheckExact(node) and not StaticTuple_CheckExact(node): |
343 |
raise TypeError('We expected a tuple() or StaticTuple() for node not: %s' |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
344 |
% type(node)) |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
345 |
node_len = len(node) |
3641.3.26
by John Arbash Meinel
A couple small tweaks. |
346 |
have_reference_lists = reference_lists |
347 |
if have_reference_lists: |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
348 |
if node_len != 4: |
349 |
raise ValueError('With ref_lists, we expected 4 entries not: %s' |
|
350 |
% len(node)) |
|
351 |
elif node_len < 3: |
|
352 |
raise ValueError('Without ref_lists, we need at least 3 entries not: %s' |
|
353 |
% len(node)) |
|
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
354 |
# TODO: We can probably do better than string.join(), namely
|
355 |
# when key has only 1 item, we can just grab that string
|
|
356 |
# And when there are 2 items, we could do a single malloc + len() + 1
|
|
357 |
# also, doing .join() requires a PyObject_GetAttrString call, which
|
|
358 |
# we could also avoid.
|
|
4679.8.4
by John Arbash Meinel
Comment about compiled code quality. |
359 |
# TODO: Note that pyrex 0.9.6 generates fairly crummy code here, using the
|
360 |
# python object interface, versus 0.9.8+ which uses a helper that
|
|
361 |
# checks if this supports the sequence interface.
|
|
362 |
# We *could* do more work on our own, and grab the actual items
|
|
363 |
# lists. For now, just ask people to use a better compiler. :)
|
|
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
364 |
string_key = '\0'.join(node[1]) |
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
365 |
|
3641.3.18
by John Arbash Meinel
Start working on a compiled function for transforming |
366 |
# TODO: instead of using string joins, precompute the final string length,
|
367 |
# and then malloc a single string and copy everything in.
|
|
3641.3.19
by John Arbash Meinel
The flatten code now handles the no-ref-list case. |
368 |
|
369 |
# TODO: We probably want to use PySequenceFast, because we have lists and
|
|
370 |
# tuples, but we aren't sure which we will get.
|
|
371 |
||
372 |
# line := string_key NULL flat_refs NULL value LF
|
|
373 |
# string_key := BYTES (NULL BYTES)*
|
|
374 |
# flat_refs := ref_list (TAB ref_list)*
|
|
375 |
# ref_list := ref (CR ref)*
|
|
376 |
# ref := BYTES (NULL BYTES)*
|
|
377 |
# value := BYTES
|
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
378 |
refs_len = 0 |
3641.3.26
by John Arbash Meinel
A couple small tweaks. |
379 |
if have_reference_lists: |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
380 |
# Figure out how many bytes it will take to store the references
|
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
381 |
ref_lists = node[3] |
3641.3.24
by John Arbash Meinel
Use the compiled flatten function. |
382 |
next_len = len(ref_lists) # TODO: use a Py function |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
383 |
if next_len > 0: |
384 |
# If there are no nodes, we don't need to do any work
|
|
385 |
# Otherwise we will need (len - 1) '\t' characters to separate
|
|
386 |
# the reference lists
|
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
387 |
refs_len = refs_len + (next_len - 1) |
3641.3.24
by John Arbash Meinel
Use the compiled flatten function. |
388 |
for ref_list in ref_lists: |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
389 |
next_len = len(ref_list) |
390 |
if next_len > 0: |
|
391 |
# We will need (len - 1) '\r' characters to separate the
|
|
392 |
# references
|
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
393 |
refs_len = refs_len + (next_len - 1) |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
394 |
for reference in ref_list: |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
395 |
if (not PyTuple_CheckExact(reference) |
396 |
and not StaticTuple_CheckExact(reference)): |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
397 |
raise TypeError( |
398 |
'We expect references to be tuples not: %s' |
|
399 |
% type(reference)) |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
400 |
next_len = len(reference) |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
401 |
if next_len > 0: |
402 |
# We will need (len - 1) '\x00' characters to
|
|
403 |
# separate the reference key
|
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
404 |
refs_len = refs_len + (next_len - 1) |
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
405 |
for ref_bit in reference: |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
406 |
if not PyString_CheckExact(ref_bit): |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
407 |
raise TypeError('We expect reference bits' |
408 |
' to be strings not: %s' |
|
409 |
% type(<object>ref_bit)) |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
410 |
refs_len = refs_len + PyString_GET_SIZE(ref_bit) |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
411 |
|
412 |
# So we have the (key NULL refs NULL value LF)
|
|
413 |
key_len = PyString_Size(string_key) |
|
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
414 |
val = node[2] |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
415 |
if not PyString_CheckExact(val): |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
416 |
raise TypeError('Expected a plain str for value not: %s' |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
417 |
% type(val)) |
418 |
value = PyString_AS_STRING(val) |
|
419 |
value_len = PyString_GET_SIZE(val) |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
420 |
flat_len = (key_len + 1 + refs_len + 1 + value_len + 1) |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
421 |
line = PyString_FromStringAndSize(NULL, flat_len) |
422 |
# Get a pointer to the new buffer
|
|
423 |
out = PyString_AsString(line) |
|
424 |
memcpy(out, PyString_AsString(string_key), key_len) |
|
425 |
out = out + key_len |
|
426 |
out[0] = c'\0' |
|
427 |
out = out + 1 |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
428 |
if refs_len > 0: |
3641.3.21
by John Arbash Meinel
Flatten the outermost str.join() into memcpy's |
429 |
first_ref_list = 1 |
3641.3.24
by John Arbash Meinel
Use the compiled flatten function. |
430 |
for ref_list in ref_lists: |
3641.3.21
by John Arbash Meinel
Flatten the outermost str.join() into memcpy's |
431 |
if first_ref_list == 0: |
432 |
out[0] = c'\t' |
|
433 |
out = out + 1 |
|
434 |
first_ref_list = 0 |
|
3641.3.22
by John Arbash Meinel
flatten the next level |
435 |
first_reference = 1 |
3641.3.21
by John Arbash Meinel
Flatten the outermost str.join() into memcpy's |
436 |
for reference in ref_list: |
3641.3.22
by John Arbash Meinel
flatten the next level |
437 |
if first_reference == 0: |
438 |
out[0] = c'\r' |
|
439 |
out = out + 1 |
|
440 |
first_reference = 0 |
|
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
441 |
next_len = len(reference) |
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
442 |
for i from 0 <= i < next_len: |
443 |
if i != 0: |
|
3641.3.23
by John Arbash Meinel
Flattened all the way down the stack. |
444 |
out[0] = c'\x00' |
445 |
out = out + 1 |
|
4679.8.2
by John Arbash Meinel
A bit of tweaking to _flatten_node. |
446 |
ref_bit = reference[i] |
4679.7.1
by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the |
447 |
ref_bit_len = PyString_GET_SIZE(ref_bit) |
448 |
memcpy(out, PyString_AS_STRING(ref_bit), ref_bit_len) |
|
3641.3.25
by John Arbash Meinel
Shave off some more time by using exact accessors. |
449 |
out = out + ref_bit_len |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
450 |
out[0] = c'\0' |
3641.3.22
by John Arbash Meinel
flatten the next level |
451 |
out = out + 1 |
3641.3.20
by John Arbash Meinel
We have a single malloc for the final output. |
452 |
memcpy(out, value, value_len) |
453 |
out = out + value_len |
|
454 |
out[0] = c'\n' |
|
3641.3.18
by John Arbash Meinel
Start working on a compiled function for transforming |
455 |
return string_key, line |