~bzr-pqm/bzr/bzr.dev

4763.2.4 by John Arbash Meinel
merge bzr.2.1 in preparation for NEWS entry.
1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob
update FSF mailing address
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3641.3.29 by John Arbash Meinel
Cleanup the copyright headers
16
#
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
17
18
"""Pyrex extensions to btree node parsing."""
19
3731.1.1 by Robert Collins
* The C extensions now build on python 2.4 (Robert Collins, #271939)
20
#python2.4 support
21
cdef extern from "python-compat.h":
22
    pass
23
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
24
cdef extern from "stdlib.h":
25
    ctypedef unsigned size_t
26
27
cdef extern from "Python.h":
3641.3.32 by John Arbash Meinel
PQM's pyrex version requires Py_ssize_t to be manually defined
28
    ctypedef int Py_ssize_t # Required for older pyrex versions
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
29
    ctypedef struct PyObject:
30
        pass
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
31
    int PyList_Append(object lst, object item) except -1
32
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
33
    char *PyString_AsString(object p) except NULL
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
34
    object PyString_FromStringAndSize(char *, Py_ssize_t)
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
35
    PyObject *PyString_FromStringAndSize_ptr "PyString_FromStringAndSize" (char *, Py_ssize_t)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
36
    object PyString_FromFormat(char *, ...)
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
37
    int PyString_CheckExact(object s)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
38
    int PyString_CheckExact_ptr "PyString_CheckExact" (PyObject *)
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
39
    Py_ssize_t PyString_Size(object p)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
40
    Py_ssize_t PyString_GET_SIZE_ptr "PyString_GET_SIZE" (PyObject *)
41
    char * PyString_AS_STRING_ptr "PyString_AS_STRING" (PyObject *)
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
42
    char * PyString_AS_STRING(object)
43
    Py_ssize_t PyString_GET_SIZE(object)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
44
    int PyString_AsStringAndSize_ptr(PyObject *, char **buf, Py_ssize_t *len)
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
45
    void PyString_InternInPlace(PyObject **)
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
46
    int PyTuple_CheckExact(object t)
4672.2.1 by John Arbash Meinel
Speed up the innermost btree parsing function.
47
    object PyTuple_New(Py_ssize_t n_entries)
48
    void PyTuple_SET_ITEM(object, Py_ssize_t offset, object) # steals the ref
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
49
    Py_ssize_t PyTuple_GET_SIZE(object t)
50
    PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index)
4672.2.1 by John Arbash Meinel
Speed up the innermost btree parsing function.
51
    void Py_INCREF(object)
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
52
    void Py_DECREF_ptr "Py_DECREF" (PyObject *)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
53
    void *PyMem_Malloc(size_t nbytes)
54
    void PyMem_Free(void *)
55
    void memset(void *, int, size_t)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
56
57
cdef extern from "string.h":
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
58
    void *memcpy(void *dest, void *src, size_t n)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
59
    void *memchr(void *s, int c, size_t n)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
60
    int memcmp(void *s1, void *s2, size_t n)
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
61
    # GNU extension
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
62
    # void *memrchr(void *s, int c, size_t n)
63
    int strncmp(char *s1, char *s2, size_t n)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
64
    unsigned long strtoul(char *s1, char **out, int base)
5365.5.16 by John Arbash Meinel
Trim the class a little bit.
65
    long long strtoll(char *s1, char **out, int base)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
66
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
67
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
68
# It seems we need to import the definitions so that the pyrex compiler has
69
# local names to access them.
70
from _static_tuple_c cimport StaticTuple, \
71
    import_static_tuple_c, StaticTuple_New, \
5365.5.11 by John Arbash Meinel
get into the nitty gritty for the _key_to_sha1 function.
72
    StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact, \
73
    StaticTuple_GET_SIZE, StaticTuple_GET_ITEM
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
74
# This tells the test infrastructure that StaticTuple is a class, so we don't
75
# have to worry about exception checking.
76
## extern cdef class StaticTuple
5365.5.28 by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers.
77
import sys
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
78
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
79
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
80
# TODO: Find some way to import this from _dirstate_helpers
4634.117.10 by John Arbash Meinel
Change 'no except' to 'cannot_raise'
81
cdef void* _my_memrchr(void *s, int c, size_t n): # cannot_raise
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
82
    # memrchr seems to be a GNU extension, so we have to implement it ourselves
83
    # It is not present in any win32 standard library
84
    cdef char *pos
85
    cdef char *start
86
87
    start = <char*>s
88
    pos = start + n - 1
89
    while pos >= start:
90
        if pos[0] == c:
91
            return <void*>pos
92
        pos = pos - 1
93
    return NULL
94
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
95
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
96
# TODO: Import this from _dirstate_helpers when it is merged
97
cdef object safe_string_from_size(char *s, Py_ssize_t size):
98
    if size < 0:
99
        raise AssertionError(
100
            'tried to create a string with an invalid size: %d @0x%x'
101
            % (size, <int>s))
102
    return PyString_FromStringAndSize(s, size)
103
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
104
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
105
cdef object safe_interned_string_from_size(char *s, Py_ssize_t size):
106
    cdef PyObject *py_str
107
    if size < 0:
108
        raise AssertionError(
109
            'tried to create a string with an invalid size: %d @0x%x'
110
            % (size, <int>s))
111
    py_str = PyString_FromStringAndSize_ptr(s, size)
112
    PyString_InternInPlace(&py_str)
113
    result = <object>py_str
114
    # Casting a PyObject* to an <object> triggers an INCREF from Pyrex, so we
115
    # DECREF it to avoid geting immortal strings
116
    Py_DECREF_ptr(py_str)
117
    return result
118
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
119
# This sets up the StaticTuple C_API functionality
120
import_static_tuple_c()
121
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
122
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
123
cdef class BTreeLeafParser:
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
124
    """Parse the leaf nodes of a BTree index.
125
126
    :ivar bytes: The PyString object containing the uncompressed text for the
127
        node.
128
    :ivar key_length: An integer describing how many pieces the keys have for
129
        this index.
130
    :ivar ref_list_length: An integer describing how many references this index
131
        contains.
132
    :ivar keys: A PyList of keys found in this node.
133
134
    :ivar _cur_str: A pointer to the start of the next line to parse
135
    :ivar _end_str: A pointer to the end of bytes
136
    :ivar _start: Pointer to the location within the current line while
137
        parsing.
138
    :ivar _header_found: True when we have parsed the header for this node
139
    """
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
140
141
    cdef object bytes
142
    cdef int key_length
143
    cdef int ref_list_length
144
    cdef object keys
145
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
146
    cdef char * _cur_str
147
    cdef char * _end_str
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
148
    # The current start point for parsing
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
149
    cdef char * _start
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
150
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
151
    cdef int _header_found
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
152
153
    def __init__(self, bytes, key_length, ref_list_length):
154
        self.bytes = bytes
155
        self.key_length = key_length
156
        self.ref_list_length = ref_list_length
157
        self.keys = []
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
158
        self._cur_str = NULL
159
        self._end_str = NULL
160
        self._header_found = 0
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
161
        # keys are tuples
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
162
163
    cdef extract_key(self, char * last):
164
        """Extract a key.
165
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
166
        :param last: points at the byte after the last byte permitted for the
167
            key.
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
168
        """
169
        cdef char *temp_ptr
170
        cdef int loop_counter
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
171
        cdef StaticTuple key
172
173
        key = StaticTuple_New(self.key_length)
4672.2.2 by John Arbash Meinel
Switch to using a for loop, and add only when checking last rather than
174
        for loop_counter from 0 <= loop_counter < self.key_length:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
175
            # grab a key segment
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
176
            temp_ptr = <char*>memchr(self._start, c'\0', last - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
177
            if temp_ptr == NULL:
4672.2.2 by John Arbash Meinel
Switch to using a for loop, and add only when checking last rather than
178
                if loop_counter + 1 == self.key_length:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
179
                    # capture to last
180
                    temp_ptr = last
181
                else:
182
                    # Invalid line
183
                    failure_string = ("invalid key, wanted segment from " +
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
184
                        repr(safe_string_from_size(self._start,
185
                                                   last - self._start)))
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
186
                    raise AssertionError(failure_string)
187
            # capture the key string
4679.8.1 by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks.
188
            if (self.key_length == 1
189
                and (temp_ptr - self._start) == 45
190
                and strncmp(self._start, 'sha1:', 5) == 0):
191
                key_element = safe_string_from_size(self._start,
192
                                                    temp_ptr - self._start)
193
            else:
194
                key_element = safe_interned_string_from_size(self._start,
4075.3.1 by John Arbash Meinel
Use PyString_InternInPlace to intern() the various parts of keys that are processed.
195
                                                         temp_ptr - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
196
            # advance our pointer
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
197
            self._start = temp_ptr + 1
4672.2.1 by John Arbash Meinel
Speed up the innermost btree parsing function.
198
            Py_INCREF(key_element)
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
199
            StaticTuple_SET_ITEM(key, loop_counter, key_element)
200
        key = StaticTuple_Intern(key)
4679.3.48 by John Arbash Meinel
Prototype of not interning sha1: strings.
201
        return key
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
202
203
    cdef int process_line(self) except -1:
204
        """Process a line in the bytes."""
205
        cdef char *last
206
        cdef char *temp_ptr
207
        cdef char *ref_ptr
208
        cdef char *next_start
209
        cdef int loop_counter
4679.8.1 by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks.
210
        cdef Py_ssize_t str_len
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
211
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
212
        self._start = self._cur_str
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
213
        # Find the next newline
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
214
        last = <char*>memchr(self._start, c'\n', self._end_str - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
215
        if last == NULL:
216
            # Process until the end of the file
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
217
            last = self._end_str
218
            self._cur_str = self._end_str
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
219
        else:
220
            # And the next string is right after it
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
221
            self._cur_str = last + 1
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
222
            # The last character is right before the '\n'
223
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
224
        if last == self._start:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
225
            # parsed it all.
226
            return 0
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
227
        if last < self._start:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
228
            # Unexpected error condition - fail
4634.60.1 by John Arbash Meinel
Don't return -1 directly, instead raise an exception.
229
            raise AssertionError("last < self._start")
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
230
        if 0 == self._header_found:
231
            # The first line in a leaf node is the header "type=leaf\n"
232
            if strncmp("type=leaf", self._start, last - self._start) == 0:
233
                self._header_found = 1
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
234
                return 0
235
            else:
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
236
                raise AssertionError('Node did not start with "type=leaf": %r'
237
                    % (safe_string_from_size(self._start, last - self._start)))
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
238
239
        key = self.extract_key(last)
240
        # find the value area
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
241
        temp_ptr = <char*>_my_memrchr(self._start, c'\0', last - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
242
        if temp_ptr == NULL:
243
            # Invalid line
4634.60.1 by John Arbash Meinel
Don't return -1 directly, instead raise an exception.
244
            raise AssertionError("Failed to find the value area")
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
245
        else:
4679.8.1 by John Arbash Meinel
Merge in the 2.1-static-tuple-btree branch, and restore the string intern tweaks.
246
            # Because of how conversions were done, we ended up with *lots* of
247
            # values that are identical. These are all of the 0-length nodes
248
            # that are referred to by the TREE_ROOT (and likely some other
249
            # directory nodes.) For example, bzr has 25k references to
250
            # something like '12607215 328306 0 0', which ends up consuming 1MB
251
            # of memory, just for those strings.
252
            str_len = last - temp_ptr - 1
253
            if (str_len > 4
254
                and strncmp(" 0 0", last - 4, 4) == 0):
255
                # This drops peak mem for bzr.dev from 87.4MB => 86.2MB
256
                # For Launchpad 236MB => 232MB
257
                value = safe_interned_string_from_size(temp_ptr + 1, str_len)
258
            else:
259
                value = safe_string_from_size(temp_ptr + 1, str_len)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
260
            # shrink the references end point
261
            last = temp_ptr
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
262
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
263
        if self.ref_list_length:
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
264
            ref_lists = StaticTuple_New(self.ref_list_length)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
265
            loop_counter = 0
266
            while loop_counter < self.ref_list_length:
267
                ref_list = []
268
                # extract a reference list
269
                loop_counter = loop_counter + 1
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
270
                if last < self._start:
4634.60.1 by John Arbash Meinel
Don't return -1 directly, instead raise an exception.
271
                    raise AssertionError("last < self._start")
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
272
                # find the next reference list end point:
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
273
                temp_ptr = <char*>memchr(self._start, c'\t', last - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
274
                if temp_ptr == NULL:
275
                    # Only valid for the last list
276
                    if loop_counter != self.ref_list_length:
277
                        # Invalid line
4634.60.1 by John Arbash Meinel
Don't return -1 directly, instead raise an exception.
278
                        raise AssertionError(
279
                            "invalid key, loop_counter != self.ref_list_length")
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
280
                    else:
281
                        # scan to the end of the ref list area
282
                        ref_ptr = last
283
                        next_start = last
284
                else:
285
                    # scan to the end of this ref list
286
                    ref_ptr = temp_ptr
287
                    next_start = temp_ptr + 1
288
                # Now, there may be multiple keys in the ref list.
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
289
                while self._start < ref_ptr:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
290
                    # loop finding keys and extracting them
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
291
                    temp_ptr = <char*>memchr(self._start, c'\r',
292
                                             ref_ptr - self._start)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
293
                    if temp_ptr == NULL:
294
                        # key runs to the end
295
                        temp_ptr = ref_ptr
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
296
4679.3.28 by John Arbash Meinel
Add a KEY_HAS_HASH define.
297
                    PyList_Append(ref_list, self.extract_key(temp_ptr))
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
298
                ref_list = StaticTuple_Intern(StaticTuple(*ref_list))
299
                Py_INCREF(ref_list)
300
                StaticTuple_SET_ITEM(ref_lists, loop_counter - 1, ref_list)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
301
                # prepare for the next reference list
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
302
                self._start = next_start
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
303
            node_value = StaticTuple(value, ref_lists)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
304
        else:
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
305
            if last != self._start:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
306
                # unexpected reference data present
4634.60.1 by John Arbash Meinel
Don't return -1 directly, instead raise an exception.
307
                raise AssertionError("unexpected reference data present")
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
308
            node_value = StaticTuple(value, StaticTuple())
309
        PyList_Append(self.keys, StaticTuple(key, node_value))
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
310
        return 0
311
312
    def parse(self):
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
313
        cdef Py_ssize_t byte_count
314
        if not PyString_CheckExact(self.bytes):
315
            raise AssertionError('self.bytes is not a string.')
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
316
        byte_count = PyString_Size(self.bytes)
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
317
        self._cur_str = PyString_AsString(self.bytes)
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
318
        # This points to the last character in the string
3641.3.2 by John Arbash Meinel
Clean up some variable names, add some documentation.
319
        self._end_str = self._cur_str + byte_count
320
        while self._cur_str < self._end_str:
3641.3.1 by John Arbash Meinel
Bring in the btree_index and chunk_writer code and their tests.
321
            self.process_line()
322
        return self.keys
323
324
325
def _parse_leaf_lines(bytes, key_length, ref_list_length):
326
    parser = BTreeLeafParser(bytes, key_length, ref_list_length)
327
    return parser.parse()
3641.3.18 by John Arbash Meinel
Start working on a compiled function for transforming
328
329
5365.5.2 by John Arbash Meinel
Parse times are dramatically improved.
330
# TODO: We can go from 8 byte offset + 4 byte length to a simple lookup,
331
#       because the block_offset + length is likely to be repeated. However,
332
#       the big win there is to cache across pages, and not just one page
333
#       Though if we did cache in a page, we could certainly use a short int.
334
#       And this goes from 40 bytes to 30 bytes.
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
335
#       One slightly ugly option would be to cache block offsets in a global.
336
#       However, that leads to thread-safety issues, etc.
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
337
ctypedef struct gc_chk_sha1_record:
5365.5.8 by John Arbash Meinel
Add tests that we handle sizes close to and >2GB properly.
338
    long long block_offset
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
339
    unsigned int block_length
340
    unsigned int record_start
341
    unsigned int record_end
342
    char sha1[20]
343
344
345
cdef int _unhexbuf[256]
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
346
cdef char *_hexbuf
347
_hexbuf = '0123456789abcdef'
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
348
349
cdef _populate_unhexbuf():
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
350
    cdef int i
351
    for i from 0 <= i < 256:
352
        _unhexbuf[i] = -1
353
    for i from 0 <= i < 10: # 0123456789 => map to the raw number
354
        _unhexbuf[(i + c'0')] = i
355
    for i from 10 <= i < 16: # abcdef => 10, 11, 12, 13, 14, 15, 16
356
        _unhexbuf[(i - 10 + c'a')] = i
357
    for i from 10 <= i < 16: # ABCDEF => 10, 11, 12, 13, 14, 15, 16
358
        _unhexbuf[(i - 10 + c'A')] = i
359
_populate_unhexbuf()
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
360
361
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
362
cdef int _unhexlify_sha1(char *as_hex, char *as_bin): # cannot_raise
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
363
    """Take the hex sha1 in as_hex and make it binary in as_bin
364
    
365
    Same as binascii.unhexlify, but working on C strings, not Python objects.
366
    """
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
367
    cdef int top
368
    cdef int bot
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
369
    cdef int i, j
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
370
    cdef char *cur
371
    
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
372
    # binascii does this using isupper() and tolower() and ?: syntax. I'm
373
    # guessing a simple lookup array should be faster.
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
374
    j = 0
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
375
    for i from 0 <= i < 20:
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
376
        top = _unhexbuf[<unsigned char>(as_hex[j])]
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
377
        j = j + 1
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
378
        bot = _unhexbuf[<unsigned char>(as_hex[j])]
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
379
        j = j + 1
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
380
        if top == -1 or bot == -1:
381
            return 0
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
382
        as_bin[i] = <unsigned char>((top << 4) + bot);
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
383
    return 1
384
385
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
386
def _py_unhexlify(as_hex):
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
387
    """For the test infrastructure, just thunks to _unhexlify_sha1"""
388
    if len(as_hex) != 40 or not PyString_CheckExact(as_hex):
389
        raise ValueError('not a 40-byte hex digest')
390
    as_bin = PyString_FromStringAndSize(NULL, 20)
391
    if _unhexlify_sha1(PyString_AS_STRING(as_hex), PyString_AS_STRING(as_bin)):
392
        return as_bin
393
    return None
394
395
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
396
cdef void _hexlify_sha1(char *as_bin, char *as_hex): # cannot_raise
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
397
    cdef int i, j
398
    cdef char c
399
400
    j = 0
401
    for i from 0 <= i < 20:
402
        c = as_bin[i]
403
        as_hex[j] = _hexbuf[(c>>4)&0xf]
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
404
        j = j + 1
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
405
        as_hex[j] = _hexbuf[(c)&0xf]
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
406
        j = j + 1
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
407
408
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
409
def _py_hexlify(as_bin):
5365.5.4 by John Arbash Meinel
Some direct tests of the hex and unhex functions.
410
    """For test infrastructure, thunk to _hexlify_sha1"""
411
    if len(as_bin) != 20 or not PyString_CheckExact(as_bin):
412
        raise ValueError('not a 20-byte binary digest')
413
    as_hex = PyString_FromStringAndSize(NULL, 40)
414
    _hexlify_sha1(PyString_AS_STRING(as_bin), PyString_AS_STRING(as_hex))
415
    return as_hex
416
417
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
418
cdef int _key_to_sha1(key, char *sha1): # cannot_raise
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
419
    """Map a key into its sha1 content.
420
421
    :param key: A tuple of style ('sha1:abcd...',)
422
    :param sha1: A char buffer of 20 bytes
423
    :return: 1 if this could be converted, 0 otherwise
424
    """
425
    cdef char *c_val
5365.5.11 by John Arbash Meinel
get into the nitty gritty for the _key_to_sha1 function.
426
    cdef PyObject *p_val
427
428
    if StaticTuple_CheckExact(key) and StaticTuple_GET_SIZE(key) == 1:
429
        p_val = <PyObject *>StaticTuple_GET_ITEM(key, 0)
430
    elif (PyTuple_CheckExact(key) and PyTuple_GET_SIZE(key) == 1):
431
        p_val = PyTuple_GET_ITEM_ptr_object(key, 0)
432
    else:
433
        # Not a tuple or a StaticTuple
434
        return 0
435
    if (PyString_CheckExact_ptr(p_val) and PyString_GET_SIZE_ptr(p_val) == 45):
436
        c_val = PyString_AS_STRING_ptr(p_val)
437
    else:
438
        return 0
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
439
    if strncmp(c_val, 'sha1:', 5) != 0:
440
        return 0
441
    if not _unhexlify_sha1(c_val + 5, sha1):
442
        return 0
443
    return 1
444
445
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
446
def _py_key_to_sha1(key):
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
447
    """Map a key to a simple sha1 string.
448
449
    This is a testing thunk to the C function.
450
    """
451
    as_bin_sha = PyString_FromStringAndSize(NULL, 20)
452
    if _key_to_sha1(key, PyString_AS_STRING(as_bin_sha)):
453
        return as_bin_sha
454
    return None
455
456
457
cdef StaticTuple _sha1_to_key(char *sha1):
458
    """Compute a ('sha1:abcd',) key for a given sha1."""
459
    cdef StaticTuple key
460
    cdef object hexxed
461
    cdef char *c_buf
462
    hexxed = PyString_FromStringAndSize(NULL, 45)
463
    c_buf = PyString_AS_STRING(hexxed)
464
    memcpy(c_buf, 'sha1:', 5)
465
    _hexlify_sha1(sha1, c_buf+5)
466
    key = StaticTuple_New(1)
467
    Py_INCREF(hexxed)
468
    StaticTuple_SET_ITEM(key, 0, hexxed)
5365.5.7 by John Arbash Meinel
Document some perf decisions.
469
    # This is a bit expensive. To parse 120 keys takes 48us, to return them all
470
    # can be done in 66.6us (so 18.6us to build them all).
471
    # Adding simple hash() here brings it to 76.6us (so computing the hash
472
    # value of 120keys is 10us), Intern is 86.9us (another 10us to look and add
473
    # them to the intern structure.)
474
    # However, since we only intern keys that are in active use, it is probably
475
    # a win. Since they would have been read from elsewhere anyway.
476
    # We *could* hang the PyObject form off of the gc_chk_sha1_record for ones
477
    # that we have deserialized. Something to think about, at least.
5365.5.6 by John Arbash Meinel
For a single page test, we parse() still takes 47us
478
    key = StaticTuple_Intern(key)
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
479
    return key
480
481
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
482
def _py_sha1_to_key(sha1_bin):
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
483
    """Test thunk to check the sha1 mapping."""
484
    if not PyString_CheckExact(sha1_bin) or PyString_GET_SIZE(sha1_bin) != 20:
485
        raise ValueError('sha1_bin must be a str of exactly 20 bytes')
486
    return _sha1_to_key(PyString_AS_STRING(sha1_bin))
487
488
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
489
cdef unsigned int _sha1_to_uint(char *sha1): # cannot_raise
5365.5.13 by John Arbash Meinel
Populate the offsets array.
490
    cdef unsigned int val
491
    # Must be in MSB, because that is how the content is sorted
492
    val = (((<unsigned int>(sha1[0]) & 0xff) << 24)
493
           | ((<unsigned int>(sha1[1]) & 0xff) << 16)
494
           | ((<unsigned int>(sha1[2]) & 0xff) << 8)
495
           | ((<unsigned int>(sha1[3]) & 0xff) << 0))
496
    return val
497
498
5365.5.28 by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers.
499
cdef _format_record(gc_chk_sha1_record *record):
500
    # This is inefficient to go from a logical state back to a
501
    # string, but it makes things work a bit better internally for now.
502
    if record.block_offset >= 0xFFFFFFFF:
503
        # %llu is what we really want, but unfortunately it was only added
504
        # in python 2.7... :(
505
        block_offset_str = str(record.block_offset)
5689.1.1 by Martin
Use correct format character for unsigned int gc_chk_sha1_record members
506
        value = PyString_FromFormat('%s %u %u %u',
5365.5.28 by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers.
507
                                PyString_AS_STRING(block_offset_str),
508
                                record.block_length,
509
                                record.record_start, record.record_end)
510
    else:
5689.1.1 by Martin
Use correct format character for unsigned int gc_chk_sha1_record members
511
        value = PyString_FromFormat('%lu %u %u %u',
5365.5.28 by John Arbash Meinel
Lots of compatibility changes for python2.4 and mingw32 compilers.
512
                                    <unsigned long>record.block_offset,
513
                                    record.block_length,
514
                                    record.record_start, record.record_end)
515
    return value
516
517
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
518
cdef class GCCHKSHA1LeafNode:
519
    """Track all the entries for a given leaf node."""
520
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
521
    cdef gc_chk_sha1_record *records
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
522
    cdef public object last_key
523
    cdef gc_chk_sha1_record *last_record
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
524
    cdef public int num_records
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
525
    # This is the number of bits to shift to get to the interesting byte. A
526
    # value of 24 means that the very first byte changes across all keys.
527
    # Anything else means that there is a common prefix of bits that we can
528
    # ignore. 0 means that at least the first 3 bytes are identical, though
529
    # that is going to be very rare
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
530
    cdef public unsigned char common_shift
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
531
    # This maps an interesting byte to the first record that matches.
532
    # Equivalent to bisect.bisect_left(self.records, sha1), though only taking
533
    # into account that one byte.
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
534
    cdef unsigned char offsets[257]
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
535
536
    def __sizeof__(self):
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
537
        # :( Why doesn't Pyrex let me do a simple sizeof(GCCHKSHA1LeafNode)
538
        # like Cython? Explicitly enumerating everything here seems to leave my
539
        # size off by 2 (286 bytes vs 288 bytes actual). I'm guessing it is an
540
        # alignment/padding issue. Oh well- at least we scale properly with
541
        # num_records and are very close to correct, which is what I care
542
        # about.
543
        # If we ever decide to require cython:
544
        # return (sizeof(GCCHKSHA1LeafNode)
545
        #     + sizeof(gc_chk_sha1_record)*self.num_records)
546
        return (sizeof(PyObject) + sizeof(void*) + sizeof(int)
547
            + sizeof(gc_chk_sha1_record*) + sizeof(PyObject *)
548
            + sizeof(gc_chk_sha1_record*) + sizeof(char)
549
            + sizeof(unsigned char)*257
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
550
            + sizeof(gc_chk_sha1_record)*self.num_records)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
551
552
    def __dealloc__(self):
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
553
        if self.records != NULL:
554
            PyMem_Free(self.records)
555
            self.records = NULL
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
556
557
    def __init__(self, bytes):
558
        self._parse_bytes(bytes)
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
559
        self.last_key = None
560
        self.last_record = NULL
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
561
562
    property min_key:
563
        def __get__(self):
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
564
            if self.num_records > 0:
565
                return _sha1_to_key(self.records[0].sha1)
5365.5.2 by John Arbash Meinel
Parse times are dramatically improved.
566
            return None
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
567
568
    property max_key:
569
        def __get__(self):
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
570
            if self.num_records > 0:
571
                return _sha1_to_key(self.records[self.num_records-1].sha1)
5365.5.2 by John Arbash Meinel
Parse times are dramatically improved.
572
            return None
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
573
574
    cdef StaticTuple _record_to_value_and_refs(self,
575
                                               gc_chk_sha1_record *record):
576
        """Extract the refs and value part of this record."""
577
        cdef StaticTuple value_and_refs
578
        cdef StaticTuple empty
579
        value_and_refs = StaticTuple_New(2)
5848.2.1 by John Arbash Meinel
Break compatibility with python <2.6.
580
        value = _format_record(record)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
581
        Py_INCREF(value)
582
        StaticTuple_SET_ITEM(value_and_refs, 0, value)
583
        # Always empty refs
584
        empty = StaticTuple_New(0)
585
        Py_INCREF(empty)
586
        StaticTuple_SET_ITEM(value_and_refs, 1, empty)
587
        return value_and_refs
588
589
    cdef StaticTuple _record_to_item(self, gc_chk_sha1_record *record):
590
        """Turn a given record back into a fully fledged item.
591
        """
592
        cdef StaticTuple item
593
        cdef StaticTuple key
594
        cdef StaticTuple value_and_refs
595
        cdef object value
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
596
        key = _sha1_to_key(record.sha1)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
597
        item = StaticTuple_New(2)
598
        Py_INCREF(key)
599
        StaticTuple_SET_ITEM(item, 0, key)
600
        value_and_refs = self._record_to_value_and_refs(record)
601
        Py_INCREF(value_and_refs)
602
        StaticTuple_SET_ITEM(item, 1, value_and_refs)
603
        return item
604
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
605
    cdef gc_chk_sha1_record* _lookup_record(self, char *sha1) except? NULL:
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
606
        """Find a gc_chk_sha1_record that matches the sha1 supplied."""
5365.5.21 by John Arbash Meinel
Remove counters, start the cleanup
607
        cdef int lo, hi, mid, the_cmp
5365.5.13 by John Arbash Meinel
Populate the offsets array.
608
        cdef int offset
5365.5.9 by John Arbash Meinel
Add some tests that key lookup works.
609
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
610
        # TODO: We can speed up misses by comparing this sha1 to the common
611
        #       bits, and seeing if the common prefix matches, if not, we don't
612
        #       need to search for anything because it cannot match
613
        # Use the offset array to find the closest fit for this entry
614
        # follow that up with bisecting, since multiple keys can be in one
615
        # spot
616
        # Bisecting dropped us from 7000 comparisons to 582 (4.8/key), using
617
        # the offset array dropped us from 23us to 20us and 156 comparisions
618
        # (1.3/key)
5365.5.13 by John Arbash Meinel
Populate the offsets array.
619
        offset = self._offset_for_sha1(sha1)
620
        lo = self.offsets[offset]
621
        hi = self.offsets[offset+1]
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
622
        if hi == 255:
623
            # if hi == 255 that means we potentially ran off the end of the
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
624
            # list, so push it up to num_records
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
625
            # note that if 'lo' == 255, that is ok, because we can start
626
            # searching from that part of the list.
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
627
            hi = self.num_records
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
628
        local_n_cmp = 0
5365.5.9 by John Arbash Meinel
Add some tests that key lookup works.
629
        while lo < hi:
630
            mid = (lo + hi) / 2
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
631
            the_cmp = memcmp(self.records[mid].sha1, sha1, 20)
5365.5.9 by John Arbash Meinel
Add some tests that key lookup works.
632
            if the_cmp == 0:
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
633
                return &self.records[mid]
5365.5.9 by John Arbash Meinel
Add some tests that key lookup works.
634
            elif the_cmp < 0:
5365.5.13 by John Arbash Meinel
Populate the offsets array.
635
                lo = mid + 1
636
            else:
5365.5.9 by John Arbash Meinel
Add some tests that key lookup works.
637
                hi = mid
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
638
        return NULL
639
640
    def __contains__(self, key):
641
        cdef char sha1[20]
642
        cdef gc_chk_sha1_record *record
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
643
        if _key_to_sha1(key, sha1):
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
644
            # If it isn't a sha1 key, then it won't be in this leaf node
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
645
            record = self._lookup_record(sha1)
646
            if record != NULL:
647
                self.last_key = key
648
                self.last_record = record
649
                return True
650
        return False
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
651
652
    def __getitem__(self, key):
653
        cdef char sha1[20]
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
654
        cdef gc_chk_sha1_record *record
655
        record = NULL
5365.5.19 by John Arbash Meinel
Lots of instrumentation. Add a last-lookup cache.
656
        if self.last_record != NULL and key is self.last_key:
657
            record = self.last_record
658
        elif _key_to_sha1(key, sha1):
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
659
            record = self._lookup_record(sha1)
660
        if record == NULL:
661
            raise KeyError('key %r is not present' % (key,))
662
        return self._record_to_value_and_refs(record)
663
664
    def __len__(self):
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
665
        return self.num_records
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
666
667
    def all_keys(self):
668
        cdef int i
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
669
        result = []
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
670
        for i from 0 <= i < self.num_records:
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
671
            PyList_Append(result, _sha1_to_key(self.records[i].sha1))
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
672
        return result
673
674
    def all_items(self):
675
        cdef int i
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
676
        result = []
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
677
        for i from 0 <= i < self.num_records:
678
            item = self._record_to_item(&self.records[i])
5365.5.22 by John Arbash Meinel
Pyrexification. Rip out all the stuff that only Cython or new versions of pyrex understand.
679
            PyList_Append(result, item)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
680
        return result
681
5365.5.29 by John Arbash Meinel
Handle test_source and extensions. Also define an 'extern' protocol, to allow
682
    cdef int _count_records(self, char *c_content, char *c_end): # cannot_raise
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
683
        """Count how many records are in this section."""
684
        cdef char *c_cur
685
        cdef int num_records
686
687
        c_cur = c_content
688
        num_records = 0
689
        while c_cur != NULL and c_cur < c_end:
690
            c_cur = <char *>memchr(c_cur, c'\n', c_end - c_cur);
691
            if c_cur == NULL:
692
                break
693
            c_cur = c_cur + 1
694
            num_records = num_records + 1
695
        return num_records
696
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
697
    cdef _parse_bytes(self, bytes):
698
        """Parse the string 'bytes' into content."""
699
        cdef char *c_bytes
700
        cdef char *c_cur
701
        cdef char *c_end
702
        cdef Py_ssize_t n_bytes
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
703
        cdef int num_records
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
704
        cdef int entry
705
        cdef gc_chk_sha1_record *cur_record
706
707
        if not PyString_CheckExact(bytes):
708
            raise TypeError('We only support parsing plain 8-bit strings.')
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
709
        # Pass 1, count how many records there will be
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
710
        n_bytes = PyString_GET_SIZE(bytes)
711
        c_bytes = PyString_AS_STRING(bytes)
712
        c_end = c_bytes + n_bytes
713
        if strncmp(c_bytes, 'type=leaf\n', 10):
714
            raise ValueError("bytes did not start with 'type=leaf\\n': %r"
715
                             % (bytes[:10],))
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
716
        c_cur = c_bytes + 10
717
        num_records = self._count_records(c_cur, c_end)
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
718
        # Now allocate the memory for these items, and go to town
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
719
        self.records = <gc_chk_sha1_record*>PyMem_Malloc(num_records *
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
720
            (sizeof(unsigned short) + sizeof(gc_chk_sha1_record)))
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
721
        self.num_records = num_records
722
        cur_record = self.records
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
723
        entry = 0
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
724
        while c_cur != NULL and c_cur < c_end and entry < num_records:
5365.5.16 by John Arbash Meinel
Trim the class a little bit.
725
            c_cur = self._parse_one_entry(c_cur, c_end, cur_record)
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
726
            cur_record = cur_record + 1
727
            entry = entry + 1
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
728
        if (entry != self.num_records
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
729
            or c_cur != c_end
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
730
            or cur_record != self.records + self.num_records):
5365.5.5 by John Arbash Meinel
Found some more bugs. I wasn't incrementing one of the pointers,
731
            raise ValueError('Something went wrong while parsing.')
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
732
        # Pass 3: build the offset map
5365.5.13 by John Arbash Meinel
Populate the offsets array.
733
        self._compute_common()
734
5365.5.16 by John Arbash Meinel
Trim the class a little bit.
735
    cdef char *_parse_one_entry(self, char *c_cur, char *c_end,
736
                                gc_chk_sha1_record *cur_record) except NULL:
737
        """Read a single sha record from the bytes.
738
        :param c_cur: The pointer to the start of bytes
739
        :param cur_record: 
740
        """
741
        cdef char *c_next
742
        if strncmp(c_cur, 'sha1:', 5):
743
            raise ValueError('line did not start with sha1: %r'
744
                % (safe_string_from_size(c_cur, 10),))
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
745
        c_cur = c_cur + 5
5365.5.16 by John Arbash Meinel
Trim the class a little bit.
746
        c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur)
747
        if c_next == NULL or (c_next - c_cur != 40):
748
            raise ValueError('Line did not contain 40 hex bytes')
749
        if not _unhexlify_sha1(c_cur, cur_record.sha1):
750
            raise ValueError('We failed to unhexlify')
751
        c_cur = c_next + 1
752
        if c_cur[0] != c'\0':
753
            raise ValueError('only 1 null, not 2 as expected')
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
754
        c_cur = c_cur + 1
5365.5.16 by John Arbash Meinel
Trim the class a little bit.
755
        cur_record.block_offset = strtoll(c_cur, &c_next, 10)
756
        if c_cur == c_next or c_next[0] != c' ':
757
            raise ValueError('Failed to parse block offset')
758
        c_cur = c_next + 1
759
        cur_record.block_length = strtoul(c_cur, &c_next, 10)
760
        if c_cur == c_next or c_next[0] != c' ':
761
            raise ValueError('Failed to parse block length')
762
        c_cur = c_next + 1
763
        cur_record.record_start = strtoul(c_cur, &c_next, 10)
764
        if c_cur == c_next or c_next[0] != c' ':
765
            raise ValueError('Failed to parse block length')
766
        c_cur = c_next + 1
767
        cur_record.record_end = strtoul(c_cur, &c_next, 10)
768
        if c_cur == c_next or c_next[0] != c'\n':
769
            raise ValueError('Failed to parse record end')
770
        c_cur = c_next + 1
771
        return c_cur
772
5365.5.13 by John Arbash Meinel
Populate the offsets array.
773
    cdef int _offset_for_sha1(self, char *sha1) except -1:
774
        """Find the first interesting 8-bits of this sha1."""
775
        cdef int this_offset
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
776
        cdef unsigned int as_uint
777
        as_uint = _sha1_to_uint(sha1)
778
        this_offset = (as_uint >> self.common_shift) & 0xFF
5365.5.13 by John Arbash Meinel
Populate the offsets array.
779
        return this_offset
780
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
781
    def _get_offset_for_sha1(self, sha1):
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
782
        return self._offset_for_sha1(PyString_AS_STRING(sha1))
783
5365.5.13 by John Arbash Meinel
Populate the offsets array.
784
    cdef _compute_common(self):
785
        cdef unsigned int first
786
        cdef unsigned int this
787
        cdef unsigned int common_mask
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
788
        cdef unsigned char common_shift
5365.5.13 by John Arbash Meinel
Populate the offsets array.
789
        cdef int i
790
        cdef int offset, this_offset
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
791
        cdef int max_offset
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
792
        # The idea with the offset map is that we should be able to quickly
793
        # jump to the key that matches a gives sha1. We know that the keys are
794
        # in sorted order, and we know that a lot of the prefix is going to be
795
        # the same across them.
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
796
        # By XORing the records together, we can determine what bits are set in
5365.5.13 by John Arbash Meinel
Populate the offsets array.
797
        # all of them
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
798
        if self.num_records < 2:
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
799
            # Everything is in common if you have 0 or 1 leaves
800
            # So we'll always just shift to the first byte
801
            self.common_shift = 24
5365.5.13 by John Arbash Meinel
Populate the offsets array.
802
        else:
803
            common_mask = 0xFFFFFFFF
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
804
            first = _sha1_to_uint(self.records[0].sha1)
805
            for i from 0 < i < self.num_records:
806
                this = _sha1_to_uint(self.records[i].sha1)
5365.5.13 by John Arbash Meinel
Populate the offsets array.
807
                common_mask = (~(first ^ this)) & common_mask
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
808
            common_shift = 24
809
            while common_mask & 0x80000000 and common_shift > 0:
5365.5.13 by John Arbash Meinel
Populate the offsets array.
810
                common_mask = common_mask << 1
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
811
                common_shift = common_shift - 1
5365.5.13 by John Arbash Meinel
Populate the offsets array.
812
            self.common_shift = common_shift
813
        offset = 0
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
814
        max_offset = self.num_records
815
        # We cap this loop at 254 records. All the other offsets just get
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
816
        # filled with 0xff as the singleton saying 'too many'.
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
817
        # It means that if we have >255 records we have to bisect the second
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
818
        # half of the list, but this is going to be very rare in practice.
819
        if max_offset > 255:
820
            max_offset = 255
821
        for i from 0 <= i < max_offset:
5365.5.17 by John Arbash Meinel
rename 'entries' to 'records', some comment cleanups.
822
            this_offset = self._offset_for_sha1(self.records[i].sha1)
5365.5.13 by John Arbash Meinel
Populate the offsets array.
823
            while offset <= this_offset:
824
                self.offsets[offset] = i
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
825
                offset = offset + 1
5365.5.13 by John Arbash Meinel
Populate the offsets array.
826
        while offset < 257:
5365.5.15 by John Arbash Meinel
Handle the case where there are more than 255 entries.
827
            self.offsets[offset] = max_offset
5365.5.27 by John Arbash Meinel
switch 'x += 1' to 'x = x + 1' to deal with brain-damaged old versions of pyrex.
828
            offset = offset + 1
5365.5.13 by John Arbash Meinel
Populate the offsets array.
829
5365.5.24 by John Arbash Meinel
Change the _test names to _get and _py so that it doesn't provoke bad smells :)
830
    def _get_offsets(self):
831
        cdef int i
832
        result = []
833
        for i from 0 <= i < 257:
834
            PyList_Append(result, self.offsets[i])
835
        return result
5365.5.1 by John Arbash Meinel
Implement a custom parser for chk btree leaves.
836
837
838
def _parse_into_chk(bytes, key_length, ref_list_length):
839
    """Parse into a format optimized for chk records."""
840
    assert key_length == 1
841
    assert ref_list_length == 0
842
    return GCCHKSHA1LeafNode(bytes)
843
844
3641.3.18 by John Arbash Meinel
Start working on a compiled function for transforming
845
def _flatten_node(node, reference_lists):
846
    """Convert a node into the serialized form.
847
848
    :param node: A tuple representing a node:
849
        (index, key_tuple, value, references)
850
    :param reference_lists: Does this index have reference lists?
851
    :return: (string_key, flattened)
852
        string_key  The serialized key for referencing this node
853
        flattened   A string with the serialized form for the contents
854
    """
3641.3.26 by John Arbash Meinel
A couple small tweaks.
855
    cdef int have_reference_lists
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
856
    cdef Py_ssize_t flat_len
857
    cdef Py_ssize_t key_len
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
858
    cdef Py_ssize_t node_len
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
859
    cdef char * value
860
    cdef Py_ssize_t value_len
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
861
    cdef char * out
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
862
    cdef Py_ssize_t refs_len
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
863
    cdef Py_ssize_t next_len
3641.3.21 by John Arbash Meinel
Flatten the outermost str.join() into memcpy's
864
    cdef int first_ref_list
3641.3.22 by John Arbash Meinel
flatten the next level
865
    cdef int first_reference
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
866
    cdef int i
867
    cdef Py_ssize_t ref_bit_len
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
868
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
869
    if not PyTuple_CheckExact(node) and not StaticTuple_CheckExact(node):
870
        raise TypeError('We expected a tuple() or StaticTuple() for node not: %s'
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
871
            % type(node))
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
872
    node_len = len(node)
3641.3.26 by John Arbash Meinel
A couple small tweaks.
873
    have_reference_lists = reference_lists
874
    if have_reference_lists:
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
875
        if node_len != 4:
876
            raise ValueError('With ref_lists, we expected 4 entries not: %s'
877
                % len(node))
878
    elif node_len < 3:
879
        raise ValueError('Without ref_lists, we need at least 3 entries not: %s'
880
            % len(node))
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
881
    # TODO: We can probably do better than string.join(), namely
882
    #       when key has only 1 item, we can just grab that string
883
    #       And when there are 2 items, we could do a single malloc + len() + 1
884
    #       also, doing .join() requires a PyObject_GetAttrString call, which
885
    #       we could also avoid.
4679.8.4 by John Arbash Meinel
Comment about compiled code quality.
886
    # TODO: Note that pyrex 0.9.6 generates fairly crummy code here, using the
887
    #       python object interface, versus 0.9.8+ which uses a helper that
888
    #       checks if this supports the sequence interface.
889
    #       We *could* do more work on our own, and grab the actual items
890
    #       lists. For now, just ask people to use a better compiler. :)
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
891
    string_key = '\0'.join(node[1])
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
892
3641.3.18 by John Arbash Meinel
Start working on a compiled function for transforming
893
    # TODO: instead of using string joins, precompute the final string length,
894
    #       and then malloc a single string and copy everything in.
3641.3.19 by John Arbash Meinel
The flatten code now handles the no-ref-list case.
895
896
    # TODO: We probably want to use PySequenceFast, because we have lists and
897
    #       tuples, but we aren't sure which we will get.
898
899
    # line := string_key NULL flat_refs NULL value LF
900
    # string_key := BYTES (NULL BYTES)*
901
    # flat_refs := ref_list (TAB ref_list)*
902
    # ref_list := ref (CR ref)*
903
    # ref := BYTES (NULL BYTES)*
904
    # value := BYTES
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
905
    refs_len = 0
3641.3.26 by John Arbash Meinel
A couple small tweaks.
906
    if have_reference_lists:
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
907
        # Figure out how many bytes it will take to store the references
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
908
        ref_lists = node[3]
3641.3.24 by John Arbash Meinel
Use the compiled flatten function.
909
        next_len = len(ref_lists) # TODO: use a Py function
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
910
        if next_len > 0:
911
            # If there are no nodes, we don't need to do any work
912
            # Otherwise we will need (len - 1) '\t' characters to separate
913
            # the reference lists
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
914
            refs_len = refs_len + (next_len - 1)
3641.3.24 by John Arbash Meinel
Use the compiled flatten function.
915
            for ref_list in ref_lists:
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
916
                next_len = len(ref_list)
917
                if next_len > 0:
918
                    # We will need (len - 1) '\r' characters to separate the
919
                    # references
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
920
                    refs_len = refs_len + (next_len - 1)
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
921
                    for reference in ref_list:
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
922
                        if (not PyTuple_CheckExact(reference)
923
                            and not StaticTuple_CheckExact(reference)):
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
924
                            raise TypeError(
925
                                'We expect references to be tuples not: %s'
926
                                % type(reference))
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
927
                        next_len = len(reference)
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
928
                        if next_len > 0:
929
                            # We will need (len - 1) '\x00' characters to
930
                            # separate the reference key
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
931
                            refs_len = refs_len + (next_len - 1)
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
932
                            for ref_bit in reference:
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
933
                                if not PyString_CheckExact(ref_bit):
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
934
                                    raise TypeError('We expect reference bits'
935
                                        ' to be strings not: %s'
936
                                        % type(<object>ref_bit))
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
937
                                refs_len = refs_len + PyString_GET_SIZE(ref_bit)
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
938
939
    # So we have the (key NULL refs NULL value LF)
940
    key_len = PyString_Size(string_key)
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
941
    val = node[2]
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
942
    if not PyString_CheckExact(val):
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
943
        raise TypeError('Expected a plain str for value not: %s'
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
944
                        % type(val))
945
    value = PyString_AS_STRING(val)
946
    value_len = PyString_GET_SIZE(val)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
947
    flat_len = (key_len + 1 + refs_len + 1 + value_len + 1)
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
948
    line = PyString_FromStringAndSize(NULL, flat_len)
949
    # Get a pointer to the new buffer
950
    out = PyString_AsString(line)
951
    memcpy(out, PyString_AsString(string_key), key_len)
952
    out = out + key_len
953
    out[0] = c'\0'
954
    out = out + 1
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
955
    if refs_len > 0:
3641.3.21 by John Arbash Meinel
Flatten the outermost str.join() into memcpy's
956
        first_ref_list = 1
3641.3.24 by John Arbash Meinel
Use the compiled flatten function.
957
        for ref_list in ref_lists:
3641.3.21 by John Arbash Meinel
Flatten the outermost str.join() into memcpy's
958
            if first_ref_list == 0:
959
                out[0] = c'\t'
960
                out = out + 1
961
            first_ref_list = 0
3641.3.22 by John Arbash Meinel
flatten the next level
962
            first_reference = 1
3641.3.21 by John Arbash Meinel
Flatten the outermost str.join() into memcpy's
963
            for reference in ref_list:
3641.3.22 by John Arbash Meinel
flatten the next level
964
                if first_reference == 0:
965
                    out[0] = c'\r'
966
                    out = out + 1
967
                first_reference = 0
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
968
                next_len = len(reference)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
969
                for i from 0 <= i < next_len:
970
                    if i != 0:
3641.3.23 by John Arbash Meinel
Flattened all the way down the stack.
971
                        out[0] = c'\x00'
972
                        out = out + 1
4679.8.2 by John Arbash Meinel
A bit of tweaking to _flatten_node.
973
                    ref_bit = reference[i]
4679.7.1 by John Arbash Meinel
Merge the 2.1-static-tuple-no-use branch, but restore the
974
                    ref_bit_len = PyString_GET_SIZE(ref_bit)
975
                    memcpy(out, PyString_AS_STRING(ref_bit), ref_bit_len)
3641.3.25 by John Arbash Meinel
Shave off some more time by using exact accessors.
976
                    out = out + ref_bit_len
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
977
    out[0] = c'\0'
3641.3.22 by John Arbash Meinel
flatten the next level
978
    out = out  + 1
3641.3.20 by John Arbash Meinel
We have a single malloc for the final output.
979
    memcpy(out, value, value_len)
980
    out = out + value_len
981
    out[0] = c'\n'
3641.3.18 by John Arbash Meinel
Start working on a compiled function for transforming
982
    return string_key, line