~bzr-pqm/bzr/bzr.dev

4763.2.4 by John Arbash Meinel
merge bzr.2.1 in preparation for NEWS entry.
1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob
update FSF mailing address
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3641.3.29 by John Arbash Meinel
Cleanup the copyright headers
16
#
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
17
18
"""Pyrex extensions to btree node parsing."""
19
3731.1.1 by Robert Collins
* The C extensions now build on python 2.4 (Robert Collins, #271939)
20
#python2.4 support
21
cdef extern from "python-compat.h":
22
    pass
23
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
24
cdef extern from "stdlib.h":
25
    ctypedef unsigned size_t
26
27
cdef extern from "Python.h":
3641.3.32 by John Arbash Meinel
PQM's pyrex version requires Py_ssize_t to be manually defined
28
    ctypedef int Py_ssize_t # Required for older pyrex versions
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
29
    ctypedef struct PyObject:
30
        pass
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
31
    int PyList_Append(object lst, object item) except -1
32
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
33
    char *PyString_AsString(object p) except NULL
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
34
    object PyString_FromStringAndSize(char *, Py_ssize_t)
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
35
    PyObject *PyString_FromStringAndSize_ptr "PyString_FromStringAndSize" (char *, Py_ssize_t)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
36
    object PyString_FromFormat(char *, ...)
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
37
    int PyString_CheckExact(object s)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
38
    int PyString_CheckExact_ptr "PyString_CheckExact" (PyObject *)
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
39
    Py_ssize_t PyString_Size(object p)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
40
    Py_ssize_t PyString_GET_SIZE_ptr "PyString_GET_SIZE" (PyObject *)
41
    char * PyString_AS_STRING_ptr "PyString_AS_STRING" (PyObject *)
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
42
    char * PyString_AS_STRING(object)
43
    Py_ssize_t PyString_GET_SIZE(object)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
44
    int PyString_AsStringAndSize_ptr(PyObject *, char **buf, Py_ssize_t *len)
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
45
    void PyString_InternInPlace(PyObject **)
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
46
    int PyTuple_CheckExact(object t)
4672.2.1 by John Arbash Meinel
Speed up the innermost btree parsing function.
47
    object PyTuple_New(Py_ssize_t n_entries)
48
    void PyTuple_SET_ITEM(object, Py_ssize_t offset, object) # steals the ref
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
49
    Py_ssize_t PyTuple_GET_SIZE(object t)
50
    PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index)
4672.2.1 by John Arbash Meinel
Speed up the innermost btree parsing function.
51
    void Py_INCREF(object)
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
52
    void Py_DECREF_ptr "Py_DECREF" (PyObject *)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
53
    void *PyMem_Malloc(size_t nbytes)
54
    void PyMem_Free(void *)
55
    void memset(void *, int, size_t)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
56
57
cdef extern from "string.h":
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
58
    void *memcpy(void *dest, void *src, size_t n)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
59
    void *memchr(void *s, int c, size_t n)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
60
    int memcmp(void *s1, void *s2, size_t n)
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
61
    # GNU extension
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
62
    # void *memrchr(void *s, int c, size_t n)
63
    int strncmp(char *s1, char *s2, size_t n)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
64
    unsigned long strtoul(char *s1, char **out, int base)
5365.5.16 by John Arbash Meinel
Trim the class a little bit.
65
    long long strtoll(char *s1, char **out, int base)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
66
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
67
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
68
# It seems we need to import the definitions so that the pyrex compiler has
69
# local names to access them.
70
from _static_tuple_c cimport StaticTuple, \
71
    import_static_tuple_c, StaticTuple_New, \
5365.5.11 by John Arbash Meinel
get into the nitty gritty for the _key_to_sha1 function.
72
    StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact, \
73
    StaticTuple_GET_SIZE, StaticTuple_GET_ITEM
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
74
# This tells the test infrastructure that StaticTuple is a class, so we don't
75
# have to worry about exception checking.
76
## extern cdef class StaticTuple
5365.5.28 by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers.
77
import sys
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
78
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
79
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
80
# TODO: Find some way to import this from _dirstate_helpers
4634.117.10 by John Arbash Meinel
Change 'no except' to 'cannot_raise'
81
cdef void* _my_memrchr(void *s, int c, size_t n): # cannot_raise
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
82
    # memrchr seems to be a GNU extension, so we have to implement it ourselves
83
    # It is not present in any win32 standard library
84
    cdef char *pos
85
    cdef char *start
86
87
    start = <char*>s
88
    pos = start + n - 1
89
    while pos >= start:
90
        if pos[0] == c:
91
            return <void*>pos
92
        pos = pos - 1
93
    return NULL
94
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
95
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
96
# TODO: Import this from _dirstate_helpers when it is merged
97
cdef object safe_string_from_size(char *s, Py_ssize_t size):
98
    if size < 0:
99
        raise AssertionError(
100
            'tried to create a string with an invalid size: %d @0x%x'
101
            % (size, <int>s))
102
    return PyString_FromStringAndSize(s, size)
103
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
104
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
105
cdef object safe_interned_string_from_size(char *s, Py_ssize_t size):
106
    cdef PyObject *py_str
107
    if size < 0:
108
        raise AssertionError(
109
            'tried to create a string with an invalid size: %d @0x%x'
110
            % (size, <int>s))
111
    py_str = PyString_FromStringAndSize_ptr(s, size)
112
    PyString_InternInPlace(&py_str)
113
    result = <object>py_str
114
    # Casting a PyObject* to an <object> triggers an INCREF from Pyrex, so we
115
    # DECREF it to avoid geting immortal strings
116
    Py_DECREF_ptr(py_str)
117
    return result
118
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
119
# This sets up the StaticTuple C_API functionality
120
import_static_tuple_c()
121
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
122
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
123
cdef class BTreeLeafParser:
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
124
    """Parse the leaf nodes of a BTree index.
125
126
    :ivar bytes: The PyString object containing the uncompressed text for the
127
        node.
128
    :ivar key_length: An integer describing how many pieces the keys have for
129
        this index.
130
    :ivar ref_list_length: An integer describing how many references this index
131
        contains.
132
    :ivar keys: A PyList of keys found in this node.
133
134
    :ivar _cur_str: A pointer to the start of the next line to parse
135
    :ivar _end_str: A pointer to the end of bytes
136
    :ivar _start: Pointer to the location within the current line while
137
        parsing.
138
    :ivar _header_found: True when we have parsed the header for this node
139
    """
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
140
141
    cdef object bytes
142
    cdef int key_length
143
    cdef int ref_list_length
144
    cdef object keys
145
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
146
    cdef char * _cur_str
147
    cdef char * _end_str
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
148
    # The current start point for parsing
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
149
    cdef char * _start
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
150
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
151
    cdef int _header_found
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
152
153
    def __init__(self, bytes, key_length, ref_list_length):
154
        self.bytes = bytes
155
        self.key_length = key_length
156
        self.ref_list_length = ref_list_length
157
        self.keys = []
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
158
        self._cur_str = NULL
159
        self._end_str = NULL
160
        self._header_found = 0
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
161
        # keys are tuples
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
162
163
    cdef extract_key(self, char * last):
164
        """Extract a key.
165
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
166
        :param last: points at the byte after the last byte permitted for the
167
            key.
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
168
        """
169
        cdef char *temp_ptr
170
        cdef int loop_counter
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
171
        cdef StaticTuple key
172
173
        key = StaticTuple_New(self.key_length)
4672.2.2 by John Arbash Meinel
Switch to using a for loop, and add only when checking last rather than
174
        for loop_counter from 0 <= loop_counter < self.key_length:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
175
            # grab a key segment
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
176
            temp_ptr = <char*>memchr(self._start, c'\0', last - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
177
            if temp_ptr == NULL:
4672.2.2 by John Arbash Meinel
Switch to using a for loop, and add only when checking last rather than
178
                if loop_counter + 1 == self.key_length:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
179
                    # capture to last
180
                    temp_ptr = last
181
                else:
182
                    # Invalid line
183
                    failure_string = ("invalid key, wanted segment from " +
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
184
                        repr(safe_string_from_size(self._start,
185
                                                   last - self._start)))
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
186
                    raise AssertionError(failure_string)
187
            # capture the key string
4679.8.1 by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks.
188
            if (self.key_length == 1
189
                and (temp_ptr - self._start) == 45
190
                and strncmp(self._start, 'sha1:', 5) == 0):
191
                key_element = safe_string_from_size(self._start,
192
                                                    temp_ptr - self._start)
193
            else:
194
                key_element = safe_interned_string_from_size(self._start,
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
195
                                                         temp_ptr - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
196
            # advance our pointer
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
197
            self._start = temp_ptr + 1
4672.2.1 by John Arbash Meinel
Speed up the innermost btree parsing function.
198
            Py_INCREF(key_element)
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
199
            StaticTuple_SET_ITEM(key, loop_counter, key_element)
200
        key = StaticTuple_Intern(key)
4679.3.48 by John Arbash Meinel
Prototype of not interning sha1: strings.
201
        return key
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
202
203
    cdef int process_line(self) except -1:
204
        """Process a line in the bytes."""
205
        cdef char *last
206
        cdef char *temp_ptr
207
        cdef char *ref_ptr
208
        cdef char *next_start
209
        cdef int loop_counter
4679.8.1 by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks.
210
        cdef Py_ssize_t str_len
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
211
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
212
        self._start = self._cur_str
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
213
        # Find the next newline
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
214
        last = <char*>memchr(self._start, c'\n', self._end_str - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
215
        if last == NULL:
216
            # Process until the end of the file
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
217
            last = self._end_str
218
            self._cur_str = self._end_str
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
219
        else:
220
            # And the next string is right after it
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
221
            self._cur_str = last + 1
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
222
            # The last character is right before the '\n'
223
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
224
        if last == self._start:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
225
            # parsed it all.
226
            return 0
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
227
        if last < self._start:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
228
            # Unexpected error condition - fail
4634.60.1 by John Arbash Meinel
Don't return -1 directly, instead raise an exception.
229
            raise AssertionError("last < self._start")
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
230
        if 0 == self._header_found:
231
            # The first line in a leaf node is the header "type=leaf\n"
232
            if strncmp("type=leaf", self._start, last - self._start) == 0:
233
                self._header_found = 1
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
234
                return 0
235
            else:
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
236
                raise AssertionError('Node did not start with "type=leaf": %r'
237
                    % (safe_string_from_size(self._start, last - self._start)))
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
238
239
        key = self.extract_key(last)
240
        # find the value area
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
241
        temp_ptr = <char*>_my_memrchr(self._start, c'\0', last - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
242
        if temp_ptr == NULL:
243
            # Invalid line
4634.60.1 by John Arbash Meinel
Don't return -1 directly, instead raise an exception.
244
            raise AssertionError("Failed to find the value area")
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
245
        else:
4679.8.1 by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks.
246
            # Because of how conversions were done, we ended up with *lots* of
247
            # values that are identical. These are all of the 0-length nodes
248
            # that are referred to by the TREE_ROOT (and likely some other
249
            # directory nodes.) For example, bzr has 25k references to
250
            # something like '12607215 328306 0 0', which ends up consuming 1MB
251
            # of memory, just for those strings.
252
            str_len = last - temp_ptr - 1
253
            if (str_len > 4
254
                and strncmp(" 0 0", last - 4, 4) == 0):
255
                # This drops peak mem for bzr.dev from 87.4MB => 86.2MB
256
                # For Launchpad 236MB => 232MB
257
                value = safe_interned_string_from_size(temp_ptr + 1, str_len)
258
            else:
259
                value = safe_string_from_size(temp_ptr + 1, str_len)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
260
            # shrink the references end point
261
            last = temp_ptr
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
262
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
263
        if self.ref_list_length:
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
264
            ref_lists = StaticTuple_New(self.ref_list_length)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
265
            loop_counter = 0
266
            while loop_counter < self.ref_list_length:
267
                ref_list = []
268
                # extract a reference list
269
                loop_counter = loop_counter + 1
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
270
                if last < self._start:
4634.60.1 by John Arbash Meinel
Don't return -1 directly, instead raise an exception.
271
                    raise AssertionError("last < self._start")
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
272
                # find the next reference list end point:
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
273
                temp_ptr = <char*>memchr(self._start, c'\t', last - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
274
                if temp_ptr == NULL:
275
                    # Only valid for the last list
276
                    if loop_counter != self.ref_list_length:
277
                        # Invalid line
4634.60.1 by John Arbash Meinel
Don't return -1 directly, instead raise an exception.
278
                        raise AssertionError(
279
                            "invalid key, loop_counter != self.ref_list_length")
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
280
                    else:
281
                        # scan to the end of the ref list area
282
                        ref_ptr = last
283
                        next_start = last
284
                else:
285
                    # scan to the end of this ref list
286
                    ref_ptr = temp_ptr
287
                    next_start = temp_ptr + 1
288
                # Now, there may be multiple keys in the ref list.
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
289
                while self._start < ref_ptr:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
290
                    # loop finding keys and extracting them
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
291
                    temp_ptr = <char*>memchr(self._start, c'\r',
292
                                             ref_ptr - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
293
                    if temp_ptr == NULL:
294
                        # key runs to the end
295
                        temp_ptr = ref_ptr
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
296
4679.3.28 by John Arbash Meinel
Add a KEY_HAS_HASH define.
297
                    PyList_Append(ref_list, self.extract_key(temp_ptr))
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
298
                ref_list = StaticTuple_Intern(StaticTuple(*ref_list))
299
                Py_INCREF(ref_list)
300
                StaticTuple_SET_ITEM(ref_lists, loop_counter - 1, ref_list)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
301
                # prepare for the next reference list
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
302
                self._start = next_start
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
303
            node_value = StaticTuple(value, ref_lists)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
304
        else:
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
305
            if last != self._start:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
306
                # unexpected reference data present
4634.60.1 by John Arbash Meinel
Don't return -1 directly, instead raise an exception.
307
                raise AssertionError("unexpected reference data present")
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
308
            node_value = StaticTuple(value, StaticTuple())
309
        PyList_Append(self.keys, StaticTuple(key, node_value))
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
310
        return 0
311
312
    def parse(self):
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
313
        cdef Py_ssize_t byte_count
314
        if not PyString_CheckExact(self.bytes):
315
            raise AssertionError('self.bytes is not a string.')
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
316
        byte_count = PyString_Size(self.bytes)
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
317
        self._cur_str = PyString_AsString(self.bytes)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
318
        # This points to the last character in the string
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
319
        self._end_str = self._cur_str + byte_count
320
        while self._cur_str < self._end_str:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
321
            self.process_line()
322
        return self.keys
323
324
325
def _parse_leaf_lines(bytes, key_length, ref_list_length):
326
    parser = BTreeLeafParser(bytes, key_length, ref_list_length)
327
    return parser.parse()
3641.3.18 by John Arbash Meinel
Start working on a compiled function for transforming
328
329
5365.5.2 by John Arbash Meinel
Parse times are dramatically improved.
330
# TODO: We can go from 8 byte offset + 4 byte length to a simple lookup,
331
#       because the block_offset + length is likely to be repeated. However,
332
#       the big win there is to cache across pages, and not just one page
333
#       Though if we did cache in a page, we could certainly use a short int.
334
#       And this goes from 40 bytes to 30 bytes.
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
335
#       One slightly ugly option would be to cache block offsets in a global.
336
#       However, that leads to thread-safety issues, etc.
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
337
ctypedef struct gc_chk_sha1_record:
5365.5.8 by John Arbash Meinel
Add tests that we handle sizes close to and >2GB properly.
338
    long long block_offset
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
339
    unsigned int block_length
340
    unsigned int record_start
341
    unsigned int record_end
342
    char sha1[20]
343
344
345
cdef int _unhexbuf[256]
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
346
cdef char *_hexbuf
347
_hexbuf = '0123456789abcdef'
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
348
349
cdef _populate_unhexbuf():
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
350
    cdef int i
351
    for i from 0 <= i < 256:
352
        _unhexbuf[i] = -1
353
    for i from 0 <= i < 10: # 0123456789 => map to the raw number
354
        _unhexbuf[(i + c'0')] = i
355
    for i from 10 <= i < 16: # abcdef => 10, 11, 12, 13, 14, 15, 16
356
        _unhexbuf[(i - 10 + c'a')] = i
357
    for i from 10 <= i < 16: # ABCDEF => 10, 11, 12, 13, 14, 15, 16
358
        _unhexbuf[(i - 10 + c'A')] = i
359
_populate_unhexbuf()
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
360
361
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
362
cdef int _unhexlify_sha1(char *as_hex, char *as_bin): # cannot_raise
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
363
    """Take the hex sha1 in as_hex and make it binary in as_bin
364
    
365
    Same as binascii.unhexlify, but working on C strings, not Python objects.
366
    """
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
367
    cdef int top
368
    cdef int bot
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
369
    cdef int i, j
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
370
    cdef char *cur
371
    
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
372
    # binascii does this using isupper() and tolower() and ?: syntax. I'm
373
    # guessing a simple lookup array should be faster.
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
374
    j = 0
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
375
    for i from 0 <= i < 20:
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
376
        top = _unhexbuf[<unsigned char>(as_hex[j])]
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
377
        j = j + 1
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
378
        bot = _unhexbuf[<unsigned char>(as_hex[j])]
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
379
        j = j + 1
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
380
        if top == -1 or bot == -1:
381
            return 0
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
382
        as_bin[i] = <unsigned char>((top << 4) + bot);
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
383
    return 1
384
385
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
386
def _py_unhexlify(as_hex):
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
387
    """For the test infrastructure, just thunks to _unhexlify_sha1"""
388
    if len(as_hex) != 40 or not PyString_CheckExact(as_hex):
389
        raise ValueError('not a 40-byte hex digest')
390
    as_bin = PyString_FromStringAndSize(NULL, 20)
391
    if _unhexlify_sha1(PyString_AS_STRING(as_hex), PyString_AS_STRING(as_bin)):
392
        return as_bin
393
    return None
394
395
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
396
cdef void _hexlify_sha1(char *as_bin, char *as_hex): # cannot_raise
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
397
    cdef int i, j
398
    cdef char c
399
400
    j = 0
401
    for i from 0 <= i < 20:
402
        c = as_bin[i]
403
        as_hex[j] = _hexbuf[(c>>4)&0xf]
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
404
        j = j + 1
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
405
        as_hex[j] = _hexbuf[(c)&0xf]
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
406
        j = j + 1
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
407
408
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
409
def _py_hexlify(as_bin):
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
410
    """For test infrastructure, thunk to _hexlify_sha1"""
411
    if len(as_bin) != 20 or not PyString_CheckExact(as_bin):
412
        raise ValueError('not a 20-byte binary digest')
413
    as_hex = PyString_FromStringAndSize(NULL, 40)
414
    _hexlify_sha1(PyString_AS_STRING(as_bin), PyString_AS_STRING(as_hex))
415
    return as_hex
416
417
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
418
cdef int _key_to_sha1(key, char *sha1): # cannot_raise
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
419
    """Map a key into its sha1 content.
420
421
    :param key: A tuple of style ('sha1:abcd...',)
422
    :param sha1: A char buffer of 20 bytes
423
    :return: 1 if this could be converted, 0 otherwise
424
    """
425
    cdef char *c_val
5365.5.11 by John Arbash Meinel
get into the nitty gritty for the _key_to_sha1 function.
426
    cdef PyObject *p_val
427
428
    if StaticTuple_CheckExact(key) and StaticTuple_GET_SIZE(key) == 1:
429
        p_val = <PyObject *>StaticTuple_GET_ITEM(key, 0)
430
    elif (PyTuple_CheckExact(key) and PyTuple_GET_SIZE(key) == 1):
431
        p_val = PyTuple_GET_ITEM_ptr_object(key, 0)
432
    else:
433
        # Not a tuple or a StaticTuple
434
        return 0
435
    if (PyString_CheckExact_ptr(p_val) and PyString_GET_SIZE_ptr(p_val) == 45):
436
        c_val = PyString_AS_STRING_ptr(p_val)
437
    else:
438
        return 0
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
439
    if strncmp(c_val, 'sha1:', 5) != 0:
440
        return 0
441
    if not _unhexlify_sha1(c_val + 5, sha1):
442
        return 0
443
    return 1
444
445
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
446
def _py_key_to_sha1(key):
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
447
    """Map a key to a simple sha1 string.
448
449
    This is a testing thunk to the C function.
450
    """
451
    as_bin_sha = PyString_FromStringAndSize(NULL, 20)
452
    if _key_to_sha1(key, PyString_AS_STRING(as_bin_sha)):
453
        return as_bin_sha
454
    return None
455
456
457
cdef StaticTuple _sha1_to_key(char *sha1):
458
    """Compute a ('sha1:abcd',) key for a given sha1."""
459
    cdef StaticTuple key
460
    cdef object hexxed
461
    cdef char *c_buf
462
    hexxed = PyString_FromStringAndSize(NULL, 45)
463
    c_buf = PyString_AS_STRING(hexxed)
464
    memcpy(c_buf, 'sha1:', 5)
465
    _hexlify_sha1(sha1, c_buf+5)
466
    key = StaticTuple_New(1)
467
    Py_INCREF(hexxed)
468
    StaticTuple_SET_ITEM(key, 0, hexxed)
5365.5.7 by John Arbash Meinel
Document some perf decisions.
469
    # This is a bit expensive. To parse 120 keys takes 48us, to return them all
470
    # can be done in 66.6us (so 18.6us to build them all).
471
    # Adding simple hash() here brings it to 76.6us (so computing the hash
472
    # value of 120keys is 10us), Intern is 86.9us (another 10us to look and add
473
    # them to the intern structure.)
474
    # However, since we only intern keys that are in active use, it is probably
475
    # a win. Since they would have been read from elsewhere anyway.
476
    # We *could* hang the PyObject form off of the gc_chk_sha1_record for ones
477
    # that we have deserialized. Something to think about, at least.
5365.5.6 by John Arbash Meinel
For a single page test, we parse() still takes 47us
478
    key = StaticTuple_Intern(key)
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
479
    return key
480
481
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
482
def _py_sha1_to_key(sha1_bin):
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
483
    """Test thunk to check the sha1 mapping."""
484
    if not PyString_CheckExact(sha1_bin) or PyString_GET_SIZE(sha1_bin) != 20:
485
        raise ValueError('sha1_bin must be a str of exactly 20 bytes')
486
    return _sha1_to_key(PyString_AS_STRING(sha1_bin))
487
488
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
489
cdef unsigned int _sha1_to_uint(char *sha1): # cannot_raise
5365.5.13 by John Arbash Meinel
Populate the offsets array.
490
    cdef unsigned int val
491
    # Must be in MSB, because that is how the content is sorted
492
    val = (((<unsigned int>(sha1[0]) & 0xff) << 24)
493
           | ((<unsigned int>(sha1[1]) & 0xff) << 16)
494
           | ((<unsigned int>(sha1[2]) & 0xff) << 8)
495
           | ((<unsigned int>(sha1[3]) & 0xff) << 0))
496
    return val
497
498
5365.5.28 by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers.
499
cdef _format_record_py24(gc_chk_sha1_record *record):
500
    """Python2.4 PyString_FromFormat doesn't have %u.
501
502
    It only has %d and %ld. We would really like to even have %llu, which
503
    is only in python2.7. So we go back into casting to regular objects.
504
    """
505
    return "%s %s %s %s" % (record.block_offset, record.block_length,
506
                            record.record_start, record.record_end)
507
508
509
cdef _format_record(gc_chk_sha1_record *record):
510
    # This is inefficient to go from a logical state back to a
511
    # string, but it makes things work a bit better internally for now.
512
    if record.block_offset >= 0xFFFFFFFF:
513
        # %llu is what we really want, but unfortunately it was only added
514
        # in python 2.7... :(
515
        block_offset_str = str(record.block_offset)
5689.1.1 by Martin
Use correct format character for unsigned int gc_chk_sha1_record members
516
        value = PyString_FromFormat('%s %u %u %u',
5365.5.28 by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers.
517
                                PyString_AS_STRING(block_offset_str),
518
                                record.block_length,
519
                                record.record_start, record.record_end)
520
    else:
5689.1.1 by Martin
Use correct format character for unsigned int gc_chk_sha1_record members
521
        value = PyString_FromFormat('%lu %u %u %u',
5365.5.28 by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers.
522
                                    <unsigned long>record.block_offset,
523
                                    record.block_length,
524
                                    record.record_start, record.record_end)
525
    return value
526
527
ctypedef object (*formatproc)(gc_chk_sha1_record *)
528
cdef formatproc _record_formatter
529
_record_formatter = _format_record
530
if sys.version_info[:2] == (2, 4):
531
    _record_formatter = _format_record_py24
532
533
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
534
cdef class GCCHKSHA1LeafNode:
535
    """Track all the entries for a given leaf node."""
536
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
537
    cdef gc_chk_sha1_record *records
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
538
    cdef public object last_key
539
    cdef gc_chk_sha1_record *last_record
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
540
    cdef public int num_records
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
541
    # This is the number of bits to shift to get to the interesting byte. A
542
    # value of 24 means that the very first byte changes across all keys.
543
    # Anything else means that there is a common prefix of bits that we can
544
    # ignore. 0 means that at least the first 3 bytes are identical, though
545
    # that is going to be very rare
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
546
    cdef public unsigned char common_shift
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
547
    # This maps an interesting byte to the first record that matches.
548
    # Equivalent to bisect.bisect_left(self.records, sha1), though only taking
549
    # into account that one byte.
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
550
    cdef unsigned char offsets[257]
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
551
552
    def __sizeof__(self):
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
553
        # :( Why doesn't Pyrex let me do a simple sizeof(GCCHKSHA1LeafNode)
554
        # like Cython? Explicitly enumerating everything here seems to leave my
555
        # size off by 2 (286 bytes vs 288 bytes actual). I'm guessing it is an
556
        # alignment/padding issue. Oh well- at least we scale properly with
557
        # num_records and are very close to correct, which is what I care
558
        # about.
559
        # If we ever decide to require cython:
560
        # return (sizeof(GCCHKSHA1LeafNode)
561
        #     + sizeof(gc_chk_sha1_record)*self.num_records)
562
        return (sizeof(PyObject) + sizeof(void*) + sizeof(int)
563
            + sizeof(gc_chk_sha1_record*) + sizeof(PyObject *)
564
            + sizeof(gc_chk_sha1_record*) + sizeof(char)
565
            + sizeof(unsigned char)*257
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
566
            + sizeof(gc_chk_sha1_record)*self.num_records)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
567
568
    def __dealloc__(self):
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
569
        if self.records != NULL:
570
            PyMem_Free(self.records)
571
            self.records = NULL
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
572
573
    def __init__(self, bytes):
574
        self._parse_bytes(bytes)
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
575
        self.last_key = None
576
        self.last_record = NULL
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
577
578
    property min_key:
579
        def __get__(self):
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
580
            if self.num_records > 0:
581
                return _sha1_to_key(self.records[0].sha1)
5365.5.2 by John Arbash Meinel
Parse times are dramatically improved.
582
            return None
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
583
584
    property max_key:
585
        def __get__(self):
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
586
            if self.num_records > 0:
587
                return _sha1_to_key(self.records[self.num_records-1].sha1)
5365.5.2 by John Arbash Meinel
Parse times are dramatically improved.
588
            return None
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
589
590
    cdef StaticTuple _record_to_value_and_refs(self,
591
                                               gc_chk_sha1_record *record):
592
        """Extract the refs and value part of this record."""
593
        cdef StaticTuple value_and_refs
594
        cdef StaticTuple empty
595
        value_and_refs = StaticTuple_New(2)
5365.5.28 by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers.
596
        value = _record_formatter(record)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
597
        Py_INCREF(value)
598
        StaticTuple_SET_ITEM(value_and_refs, 0, value)
599
        # Always empty refs
600
        empty = StaticTuple_New(0)
601
        Py_INCREF(empty)
602
        StaticTuple_SET_ITEM(value_and_refs, 1, empty)
603
        return value_and_refs
604
605
    cdef StaticTuple _record_to_item(self, gc_chk_sha1_record *record):
606
        """Turn a given record back into a fully fledged item.
607
        """
608
        cdef StaticTuple item
609
        cdef StaticTuple key
610
        cdef StaticTuple value_and_refs
611
        cdef object value
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
612
        key = _sha1_to_key(record.sha1)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
613
        item = StaticTuple_New(2)
614
        Py_INCREF(key)
615
        StaticTuple_SET_ITEM(item, 0, key)
616
        value_and_refs = self._record_to_value_and_refs(record)
617
        Py_INCREF(value_and_refs)
618
        StaticTuple_SET_ITEM(item, 1, value_and_refs)
619
        return item
620
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
621
    cdef gc_chk_sha1_record* _lookup_record(self, char *sha1) except? NULL:
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
622
        """Find a gc_chk_sha1_record that matches the sha1 supplied."""
5365.5.21 by John Arbash Meinel
Remove counters, start the cleanup
623
        cdef int lo, hi, mid, the_cmp
5365.5.13 by John Arbash Meinel
Populate the offsets array.
624
        cdef int offset
5365.5.9 by John Arbash Meinel
Add some tests that key lookup works.
625
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
626
        # TODO: We can speed up misses by comparing this sha1 to the common
627
        #       bits, and seeing if the common prefix matches, if not, we don't
628
        #       need to search for anything because it cannot match
629
        # Use the offset array to find the closest fit for this entry
630
        # follow that up with bisecting, since multiple keys can be in one
631
        # spot
632
        # Bisecting dropped us from 7000 comparisons to 582 (4.8/key), using
633
        # the offset array dropped us from 23us to 20us and 156 comparisions
634
        # (1.3/key)
5365.5.13 by John Arbash Meinel
Populate the offsets array.
635
        offset = self._offset_for_sha1(sha1)
636
        lo = self.offsets[offset]
637
        hi = self.offsets[offset+1]
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
638
        if hi == 255:
639
            # if hi == 255 that means we potentially ran off the end of the
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
640
            # list, so push it up to num_records
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
641
            # note that if 'lo' == 255, that is ok, because we can start
642
            # searching from that part of the list.
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
643
            hi = self.num_records
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
644
        local_n_cmp = 0
5365.5.9 by John Arbash Meinel
Add some tests that key lookup works.
645
        while lo < hi:
646
            mid = (lo + hi) / 2
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
647
            the_cmp = memcmp(self.records[mid].sha1, sha1, 20)
5365.5.9 by John Arbash Meinel
Add some tests that key lookup works.
648
            if the_cmp == 0:
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
649
                return &self.records[mid]
5365.5.9 by John Arbash Meinel
Add some tests that key lookup works.
650
            elif the_cmp < 0:
5365.5.13 by John Arbash Meinel
Populate the offsets array.
651
                lo = mid + 1
652
            else:
5365.5.9 by John Arbash Meinel
Add some tests that key lookup works.
653
                hi = mid
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
654
        return NULL
655
656
    def __contains__(self, key):
657
        cdef char sha1[20]
658
        cdef gc_chk_sha1_record *record
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
659
        if _key_to_sha1(key, sha1):
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
660
            # If it isn't a sha1 key, then it won't be in this leaf node
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
661
            record = self._lookup_record(sha1)
662
            if record != NULL:
663
                self.last_key = key
664
                self.last_record = record
665
                return True
666
        return False
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
667
668
    def __getitem__(self, key):
669
        cdef char sha1[20]
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
670
        cdef gc_chk_sha1_record *record
671
        record = NULL
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
672
        if self.last_record != NULL and key is self.last_key:
673
            record = self.last_record
674
        elif _key_to_sha1(key, sha1):
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
675
            record = self._lookup_record(sha1)
676
        if record == NULL:
677
            raise KeyError('key %r is not present' % (key,))
678
        return self._record_to_value_and_refs(record)
679
680
    def __len__(self):
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
681
        return self.num_records
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
682
683
    def all_keys(self):
684
        cdef int i
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
685
        result = []
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
686
        for i from 0 <= i < self.num_records:
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
687
            PyList_Append(result, _sha1_to_key(self.records[i].sha1))
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
688
        return result
689
690
    def all_items(self):
691
        cdef int i
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
692
        result = []
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
693
        for i from 0 <= i < self.num_records:
694
            item = self._record_to_item(&self.records[i])
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
695
            PyList_Append(result, item)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
696
        return result
697
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
698
    cdef int _count_records(self, char *c_content, char *c_end): # cannot_raise
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
699
        """Count how many records are in this section."""
700
        cdef char *c_cur
701
        cdef int num_records
702
703
        c_cur = c_content
704
        num_records = 0
705
        while c_cur != NULL and c_cur < c_end:
706
            c_cur = <char *>memchr(c_cur, c'\n', c_end - c_cur);
707
            if c_cur == NULL:
708
                break
709
            c_cur = c_cur + 1
710
            num_records = num_records + 1
711
        return num_records
712
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
713
    cdef _parse_bytes(self, bytes):
714
        """Parse the string 'bytes' into content."""
715
        cdef char *c_bytes
716
        cdef char *c_cur
717
        cdef char *c_end
718
        cdef Py_ssize_t n_bytes
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
719
        cdef int num_records
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
720
        cdef int entry
721
        cdef gc_chk_sha1_record *cur_record
722
723
        if not PyString_CheckExact(bytes):
724
            raise TypeError('We only support parsing plain 8-bit strings.')
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
725
        # Pass 1, count how many records there will be
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
726
        n_bytes = PyString_GET_SIZE(bytes)
727
        c_bytes = PyString_AS_STRING(bytes)
728
        c_end = c_bytes + n_bytes
729
        if strncmp(c_bytes, 'type=leaf\n', 10):
730
            raise ValueError("bytes did not start with 'type=leaf\\n': %r"
731
                             % (bytes[:10],))
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
732
        c_cur = c_bytes + 10
733
        num_records = self._count_records(c_cur, c_end)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
734
        # Now allocate the memory for these items, and go to town
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
735
        self.records = <gc_chk_sha1_record*>PyMem_Malloc(num_records *
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
736
            (sizeof(unsigned short) + sizeof(gc_chk_sha1_record)))
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
737
        self.num_records = num_records
738
        cur_record = self.records
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
739
        entry = 0
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
740
        while c_cur != NULL and c_cur < c_end and entry < num_records:
5365.5.16 by John Arbash Meinel
Trim the class a little bit.
741
            c_cur = self._parse_one_entry(c_cur, c_end, cur_record)
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
742
            cur_record = cur_record + 1
743
            entry = entry + 1
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
744
        if (entry != self.num_records
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
745
            or c_cur != c_end
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
746
            or cur_record != self.records + self.num_records):
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
747
            raise ValueError('Something went wrong while parsing.')
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
748
        # Pass 3: build the offset map
5365.5.13 by John Arbash Meinel
Populate the offsets array.
749
        self._compute_common()
750
5365.5.16 by John Arbash Meinel
Trim the class a little bit.
751
    cdef char *_parse_one_entry(self, char *c_cur, char *c_end,
752
                                gc_chk_sha1_record *cur_record) except NULL:
753
        """Read a single sha record from the bytes.
754
        :param c_cur: The pointer to the start of bytes
755
        :param cur_record: 
756
        """
757
        cdef char *c_next
758
        if strncmp(c_cur, 'sha1:', 5):
759
            raise ValueError('line did not start with sha1: %r'
760
                % (safe_string_from_size(c_cur, 10),))
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
761
        c_cur = c_cur + 5
5365.5.16 by John Arbash Meinel
Trim the class a little bit.
762
        c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur)
763
        if c_next == NULL or (c_next - c_cur != 40):
764
            raise ValueError('Line did not contain 40 hex bytes')
765
        if not _unhexlify_sha1(c_cur, cur_record.sha1):
766
            raise ValueError('We failed to unhexlify')
767
        c_cur = c_next + 1
768
        if c_cur[0] != c'\0':
769
            raise ValueError('only 1 null, not 2 as expected')
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
770
        c_cur = c_cur + 1
5365.5.16 by John Arbash Meinel
Trim the class a little bit.
771
        cur_record.block_offset = strtoll(c_cur, &c_next, 10)
772
        if c_cur == c_next or c_next[0] != c' ':
773
            raise ValueError('Failed to parse block offset')
774
        c_cur = c_next + 1
775
        cur_record.block_length = strtoul(c_cur, &c_next, 10)
776
        if c_cur == c_next or c_next[0] != c' ':
777
            raise ValueError('Failed to parse block length')
778
        c_cur = c_next + 1
779
        cur_record.record_start = strtoul(c_cur, &c_next, 10)
780
        if c_cur == c_next or c_next[0] != c' ':
781
            raise ValueError('Failed to parse block length')
782
        c_cur = c_next + 1
783
        cur_record.record_end = strtoul(c_cur, &c_next, 10)
784
        if c_cur == c_next or c_next[0] != c'\n':
785
            raise ValueError('Failed to parse record end')
786
        c_cur = c_next + 1
787
        return c_cur
788
5365.5.13 by John Arbash Meinel
Populate the offsets array.
789
    cdef int _offset_for_sha1(self, char *sha1) except -1:
790
        """Find the first interesting 8-bits of this sha1."""
791
        cdef int this_offset
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
792
        cdef unsigned int as_uint
793
        as_uint = _sha1_to_uint(sha1)
794
        this_offset = (as_uint >> self.common_shift) & 0xFF
5365.5.13 by John Arbash Meinel
Populate the offsets array.
795
        return this_offset
796
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
797
    def _get_offset_for_sha1(self, sha1):
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
798
        return self._offset_for_sha1(PyString_AS_STRING(sha1))
799
5365.5.13 by John Arbash Meinel
Populate the offsets array.
800
    cdef _compute_common(self):
801
        cdef unsigned int first
802
        cdef unsigned int this
803
        cdef unsigned int common_mask
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
804
        cdef unsigned char common_shift
5365.5.13 by John Arbash Meinel
Populate the offsets array.
805
        cdef int i
806
        cdef int offset, this_offset
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
807
        cdef int max_offset
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
808
        # The idea with the offset map is that we should be able to quickly
809
        # jump to the key that matches a gives sha1. We know that the keys are
810
        # in sorted order, and we know that a lot of the prefix is going to be
811
        # the same across them.
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
812
        # By XORing the records together, we can determine what bits are set in
5365.5.13 by John Arbash Meinel
Populate the offsets array.
813
        # all of them
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
814
        if self.num_records < 2:
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
815
            # Everything is in common if you have 0 or 1 leaves
816
            # So we'll always just shift to the first byte
817
            self.common_shift = 24
5365.5.13 by John Arbash Meinel
Populate the offsets array.
818
        else:
819
            common_mask = 0xFFFFFFFF
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
820
            first = _sha1_to_uint(self.records[0].sha1)
821
            for i from 0 < i < self.num_records:
822
                this = _sha1_to_uint(self.records[i].sha1)
5365.5.13 by John Arbash Meinel
Populate the offsets array.
823
                common_mask = (~(first ^ this)) & common_mask
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
824
            common_shift = 24
825
            while common_mask & 0x80000000 and common_shift > 0:
5365.5.13 by John Arbash Meinel
Populate the offsets array.
826
                common_mask = common_mask << 1
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
827
                common_shift = common_shift - 1
5365.5.13 by John Arbash Meinel
Populate the offsets array.
828
            self.common_shift = common_shift
829
        offset = 0
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
830
        max_offset = self.num_records
831
        # We cap this loop at 254 records. All the other offsets just get
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
832
        # filled with 0xff as the singleton saying 'too many'.
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
833
        # It means that if we have >255 records we have to bisect the second
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
834
        # half of the list, but this is going to be very rare in practice.
835
        if max_offset > 255:
836
            max_offset = 255
837
        for i from 0 <= i < max_offset:
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
838
            this_offset = self._offset_for_sha1(self.records[i].sha1)
5365.5.13 by John Arbash Meinel
Populate the offsets array.
839
            while offset <= this_offset:
840
                self.offsets[offset] = i
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
841
                offset = offset + 1
5365.5.13 by John Arbash Meinel
Populate the offsets array.
842
        while offset < 257:
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
843
            self.offsets[offset] = max_offset
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
844
            offset = offset + 1
5365.5.13 by John Arbash Meinel
Populate the offsets array.
845
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
846
    def _get_offsets(self):
847
        cdef int i
848
        result = []
849
        for i from 0 <= i < 257:
850
            PyList_Append(result, self.offsets[i])
851
        return result
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
852
853
854
def _parse_into_chk(bytes, key_length, ref_list_length):
855
    """Parse into a format optimized for chk records."""
856
    assert key_length == 1
857
    assert ref_list_length == 0
858
    return GCCHKSHA1LeafNode(bytes)
859
860
3641.3.18 by John Arbash Meinel
Start working on a compiled function for transforming
861
def _flatten_node(node, reference_lists):
862
    """Convert a node into the serialized form.
863
864
    :param node: A tuple representing a node:
865
        (index, key_tuple, value, references)
866
    :param reference_lists: Does this index have reference lists?
867
    :return: (string_key, flattened)
868
        string_key  The serialized key for referencing this node
869
        flattened   A string with the serialized form for the contents
870
    """
3641.3.26 by John Arbash Meinel
A couple small tweaks.
871
    cdef int have_reference_lists
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
872
    cdef Py_ssize_t flat_len
873
    cdef Py_ssize_t key_len
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
874
    cdef Py_ssize_t node_len
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
875
    cdef char * value
876
    cdef Py_ssize_t value_len
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
877
    cdef char * out
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
878
    cdef Py_ssize_t refs_len
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
879
    cdef Py_ssize_t next_len
3641.3.21 by John Arbash Meinel
Flatten the outermost str.join() into memcpy's
880
    cdef int first_ref_list
3641.3.22 by John Arbash Meinel
flatten the next level
881
    cdef int first_reference
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
882
    cdef int i
883
    cdef Py_ssize_t ref_bit_len
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
884
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
885
    if not PyTuple_CheckExact(node) and not StaticTuple_CheckExact(node):
886
        raise TypeError('We expected a tuple() or StaticTuple() for node not: %s'
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
887
            % type(node))
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
888
    node_len = len(node)
3641.3.26 by John Arbash Meinel
A couple small tweaks.
889
    have_reference_lists = reference_lists
890
    if have_reference_lists:
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
891
        if node_len != 4:
892
            raise ValueError('With ref_lists, we expected 4 entries not: %s'
893
                % len(node))
894
    elif node_len < 3:
895
        raise ValueError('Without ref_lists, we need at least 3 entries not: %s'
896
            % len(node))
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
897
    # TODO: We can probably do better than string.join(), namely
898
    #       when key has only 1 item, we can just grab that string
899
    #       And when there are 2 items, we could do a single malloc + len() + 1
900
    #       also, doing .join() requires a PyObject_GetAttrString call, which
901
    #       we could also avoid.
4679.8.4 by John Arbash Meinel
Comment about compiled code quality.
902
    # TODO: Note that pyrex 0.9.6 generates fairly crummy code here, using the
903
    #       python object interface, versus 0.9.8+ which uses a helper that
904
    #       checks if this supports the sequence interface.
905
    #       We *could* do more work on our own, and grab the actual items
906
    #       lists. For now, just ask people to use a better compiler. :)
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
907
    string_key = '\0'.join(node[1])
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
908
3641.3.18 by John Arbash Meinel
Start working on a compiled function for transforming
909
    # TODO: instead of using string joins, precompute the final string length,
910
    #       and then malloc a single string and copy everything in.
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
911
912
    # TODO: We probably want to use PySequenceFast, because we have lists and
913
    #       tuples, but we aren't sure which we will get.
914
915
    # line := string_key NULL flat_refs NULL value LF
916
    # string_key := BYTES (NULL BYTES)*
917
    # flat_refs := ref_list (TAB ref_list)*
918
    # ref_list := ref (CR ref)*
919
    # ref := BYTES (NULL BYTES)*
920
    # value := BYTES
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
921
    refs_len = 0
3641.3.26 by John Arbash Meinel
A couple small tweaks.
922
    if have_reference_lists:
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
923
        # Figure out how many bytes it will take to store the references
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
924
        ref_lists = node[3]
3641.3.24 by John Arbash Meinel
Use the compiled flatten function.
925
        next_len = len(ref_lists) # TODO: use a Py function
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
926
        if next_len > 0:
927
            # If there are no nodes, we don't need to do any work
928
            # Otherwise we will need (len - 1) '\t' characters to separate
929
            # the reference lists
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
930
            refs_len = refs_len + (next_len - 1)
3641.3.24 by John Arbash Meinel
Use the compiled flatten function.
931
            for ref_list in ref_lists:
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
932
                next_len = len(ref_list)
933
                if next_len > 0:
934
                    # We will need (len - 1) '\r' characters to separate the
935
                    # references
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
936
                    refs_len = refs_len + (next_len - 1)
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
937
                    for reference in ref_list:
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
938
                        if (not PyTuple_CheckExact(reference)
939
                            and not StaticTuple_CheckExact(reference)):
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
940
                            raise TypeError(
941
                                'We expect references to be tuples not: %s'
942
                                % type(reference))
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
943
                        next_len = len(reference)
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
944
                        if next_len > 0:
945
                            # We will need (len - 1) '\x00' characters to
946
                            # separate the reference key
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
947
                            refs_len = refs_len + (next_len - 1)
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
948
                            for ref_bit in reference:
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
949
                                if not PyString_CheckExact(ref_bit):
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
950
                                    raise TypeError('We expect reference bits'
951
                                        ' to be strings not: %s'
952
                                        % type(<object>ref_bit))
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
953
                                refs_len = refs_len + PyString_GET_SIZE(ref_bit)
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
954
955
    # So we have the (key NULL refs NULL value LF)
956
    key_len = PyString_Size(string_key)
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
957
    val = node[2]
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
958
    if not PyString_CheckExact(val):
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
959
        raise TypeError('Expected a plain str for value not: %s'
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
960
                        % type(val))
961
    value = PyString_AS_STRING(val)
962
    value_len = PyString_GET_SIZE(val)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
963
    flat_len = (key_len + 1 + refs_len + 1 + value_len + 1)
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
964
    line = PyString_FromStringAndSize(NULL, flat_len)
965
    # Get a pointer to the new buffer
966
    out = PyString_AsString(line)
967
    memcpy(out, PyString_AsString(string_key), key_len)
968
    out = out + key_len
969
    out[0] = c'\0'
970
    out = out + 1
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
971
    if refs_len > 0:
3641.3.21 by John Arbash Meinel
Flatten the outermost str.join() into memcpy's
972
        first_ref_list = 1
3641.3.24 by John Arbash Meinel
Use the compiled flatten function.
973
        for ref_list in ref_lists:
3641.3.21 by John Arbash Meinel
Flatten the outermost str.join() into memcpy's
974
            if first_ref_list == 0:
975
                out[0] = c'\t'
976
                out = out + 1
977
            first_ref_list = 0
3641.3.22 by John Arbash Meinel
flatten the next level
978
            first_reference = 1
3641.3.21 by John Arbash Meinel
Flatten the outermost str.join() into memcpy's
979
            for reference in ref_list:
3641.3.22 by John Arbash Meinel
flatten the next level
980
                if first_reference == 0:
981
                    out[0] = c'\r'
982
                    out = out + 1
983
                first_reference = 0
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
984
                next_len = len(reference)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
985
                for i from 0 <= i < next_len:
986
                    if i != 0:
3641.3.23 by John Arbash Meinel
Flattened all the way down the stack.
987
                        out[0] = c'\x00'
988
                        out = out + 1
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
989
                    ref_bit = reference[i]
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
990
                    ref_bit_len = PyString_GET_SIZE(ref_bit)
991
                    memcpy(out, PyString_AS_STRING(ref_bit), ref_bit_len)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
992
                    out = out + ref_bit_len
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
993
    out[0] = c'\0'
3641.3.22 by John Arbash Meinel
flatten the next level
994
    out = out  + 1
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
995
    memcpy(out, value, value_len)
996
    out = out + value_len
997
    out[0] = c'\n'
3641.3.18 by John Arbash Meinel
Start working on a compiled function for transforming
998
    return string_key, line