~bzr-pqm/bzr/bzr.dev

2484.1.1 by John Arbash Meinel
Add an initial function to read knit indexes in pyrex.
1
# Copyright (C) 2007 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""Pyrex extensions to knit parsing."""
18
2484.1.10 by John Arbash Meinel
Revert previous change since it doesn't help
19
import sys
2484.1.1 by John Arbash Meinel
Add an initial function to read knit indexes in pyrex.
20
2484.1.11 by John Arbash Meinel
[merge] bzr.dev 2563 and add integer checking to the extension.
21
from bzrlib import errors
22
23
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
24
cdef extern from "stdlib.h":
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
25
    ctypedef unsigned size_t
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
26
    long int strtol(char *nptr, char **endptr, int base)
27
28
29
cdef extern from "Python.h":
30
    int PyDict_CheckExact(object)
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
31
    void *PyDict_GetItem_void "PyDict_GetItem" (object p, object key)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
32
    int PyDict_SetItem(object p, object key, object val) except -1
33
34
    int PyList_Append(object lst, object item) except -1
2484.1.22 by John Arbash Meinel
Remove unused function definitions.
35
    object PyList_GET_ITEM(object lst, int index)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
36
    int PyList_CheckExact(object)
37
38
    void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)
39
40
    char *PyString_AsString(object p)
41
    object PyString_FromStringAndSize(char *, int)
42
    int PyString_Size(object p)
43
44
    void Py_INCREF(object)
45
46
47
cdef extern from "string.h":
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
48
    void *memchr(void *s, int c, size_t n)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
49
50
2484.1.18 by John Arbash Meinel
Test that we properly verify the size and position strings.
51
cdef int string_to_int_safe(char *s, char *end, int *out) except -1:
52
    """Convert a base10 string to an integer.
53
54
    This makes sure the whole string is consumed, or it raises ValueError.
55
    This is similar to how int(s) works, except you don't need a Python
56
    String object.
57
58
    :param s: The string to convert
59
    :param end: The character after the integer. So if the string is '12\0',
60
        this should be pointing at the '\0'. If the string was '12 ' then this
61
        should point at the ' '.
62
    :param out: This is the integer that will be returned
63
    :return: -1 if an exception is raised. 0 otherwise
64
    """
65
    cdef char *integer_end
66
67
    # We can't just return the integer because of how pyrex determines when
68
    # there is an exception.
2484.1.21 by John Arbash Meinel
Make sure we cast the long int back to a regular int
69
    out[0] = <int>strtol(s, &integer_end, 10)
2484.1.18 by John Arbash Meinel
Test that we properly verify the size and position strings.
70
    if integer_end != end:
71
        py_s = PyString_FromStringAndSize(s, end-s)
72
        raise ValueError('%r is not a valid integer' % (py_s,))
73
    return 0
74
75
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
76
cdef class KnitIndexReader:
77
78
    cdef object kndx
79
    cdef object fp
80
81
    cdef object cache
82
    cdef object history
83
84
    cdef char * cur_str
85
    cdef char * end_str
86
87
    cdef int history_len
88
2904.2.1 by John Arbash Meinel
Switch from __new__ to __init__ to avoid potential pyrex upgrade problems.
89
    def __init__(self, kndx, fp):
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
90
        self.kndx = kndx
91
        self.fp = fp
92
93
        self.cache = kndx._cache
94
        self.history = kndx._history
95
96
        self.cur_str = NULL
97
        self.end_str = NULL
98
        self.history_len = 0
99
100
    cdef void validate(self):
101
        if not PyDict_CheckExact(self.cache):
102
            raise TypeError('kndx._cache must be a python dict')
103
        if not PyList_CheckExact(self.history):
104
            raise TypeError('kndx._history must be a python list')
105
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
106
    cdef object process_options(self, char *option_str, char *end):
107
        """Process the options string into a list."""
108
        cdef char *next
109
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
110
        # This is alternative code which creates a python string and splits it.
111
        # It is "correct" and more obvious, but slower than the following code.
112
        # It can be uncommented to switch in case the other code is seen as
113
        # suspect.
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
114
        # options = PyString_FromStringAndSize(option_str,
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
115
        #                                      end - option_str)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
116
        # return options.split(',')
117
118
        final_options = []
119
120
        while option_str < end:
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
121
            next = <char*>memchr(option_str, c',', end - option_str)
122
            if next == NULL:
123
                next = end
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
124
            next_option = PyString_FromStringAndSize(option_str,
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
125
                                                     next - option_str)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
126
            PyList_Append(final_options, next_option)
2484.1.13 by John Arbash Meinel
Add a test that KnitCorrupt is raised when parent strings are invalid.
127
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
128
            # Move past the ','
129
            option_str = next+1
130
131
        return final_options
132
133
    cdef object process_parents(self, char *parent_str, char *end):
134
        cdef char *next
135
        cdef int int_parent
2484.1.11 by John Arbash Meinel
[merge] bzr.dev 2563 and add integer checking to the extension.
136
        cdef char *parent_end
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
137
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
138
        # Alternative, correct but slower code.
139
        #
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
140
        # parents = PyString_FromStringAndSize(parent_str,
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
141
        #                                      end - parent_str)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
142
        # real_parents = []
143
        # for parent in parents.split():
144
        #     if parent[0].startswith('.'):
145
        #         real_parents.append(parent[1:])
146
        #     else:
147
        #         real_parents.append(self.history[int(parent)])
148
        # return real_parents
149
150
        parents = []
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
151
        while parent_str <= end:
152
            next = <char*>memchr(parent_str, c' ', end - parent_str)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
153
            if next == NULL or next >= end or next == parent_str:
154
                break
155
156
            if parent_str[0] == c'.':
157
                # This is an explicit revision id
158
                parent_str = parent_str + 1
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
159
                parent = PyString_FromStringAndSize(parent_str,
160
                                                    next - parent_str)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
161
            else:
162
                # This in an integer mapping to original
2484.1.18 by John Arbash Meinel
Test that we properly verify the size and position strings.
163
                string_to_int_safe(parent_str, next, &int_parent)
2484.1.11 by John Arbash Meinel
[merge] bzr.dev 2563 and add integer checking to the extension.
164
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
165
                if int_parent >= self.history_len:
166
                    raise IndexError('Parent index refers to a revision which'
167
                        ' does not exist yet.'
168
                        ' %d > %d' % (int_parent, self.history_len))
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
169
                parent = PyList_GET_ITEM(self.history, int_parent)
170
                # PyList_GET_ITEM steals a reference
171
                Py_INCREF(parent)
172
            PyList_Append(parents, parent)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
173
            parent_str = next + 1
174
        return parents
175
176
    cdef int process_one_record(self, char *start, char *end) except -1:
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
177
        """Take a simple string and split it into an index record."""
178
        cdef char *version_id_str
179
        cdef int version_id_size
180
        cdef char *option_str
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
181
        cdef char *option_end
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
182
        cdef char *pos_str
183
        cdef int pos
184
        cdef char *size_str
185
        cdef int size
186
        cdef char *parent_str
187
        cdef int parent_size
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
188
        cdef void *cache_entry
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
189
190
        version_id_str = start
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
191
        option_str = <char*>memchr(version_id_str, c' ', end - version_id_str)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
192
        if option_str == NULL or option_str >= end:
193
            # Short entry
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
194
            return 0
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
195
        version_id_size = <int>(option_str - version_id_str)
196
        # Move past the space character
197
        option_str = option_str + 1
198
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
199
        pos_str = <char*>memchr(option_str, c' ', end - option_str)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
200
        if pos_str == NULL or pos_str >= end:
201
            # Short entry
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
202
            return 0
203
        option_end = pos_str
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
204
        pos_str = pos_str + 1
205
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
206
        size_str = <char*>memchr(pos_str, c' ', end - pos_str)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
207
        if size_str == NULL or size_str >= end:
208
            # Short entry
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
209
            return 0
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
210
        size_str = size_str + 1
211
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
212
        parent_str = <char*>memchr(size_str, c' ', end - size_str)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
213
        if parent_str == NULL or parent_str >= end:
214
            # Missing parents
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
215
            return 0
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
216
        parent_str = parent_str + 1
217
218
        version_id = PyString_FromStringAndSize(version_id_str,
219
                                                version_id_size)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
220
        options = self.process_options(option_str, option_end)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
221
2484.1.11 by John Arbash Meinel
[merge] bzr.dev 2563 and add integer checking to the extension.
222
        try:
2484.1.18 by John Arbash Meinel
Test that we properly verify the size and position strings.
223
            string_to_int_safe(pos_str, size_str - 1, &pos)
224
            string_to_int_safe(size_str, parent_str - 1, &size)
2484.1.11 by John Arbash Meinel
[merge] bzr.dev 2563 and add integer checking to the extension.
225
            parents = self.process_parents(parent_str, end)
226
        except (ValueError, IndexError), e:
227
            py_line = PyString_FromStringAndSize(start, end - start)
228
            raise errors.KnitCorrupt(self.kndx._filename,
2484.1.13 by John Arbash Meinel
Add a test that KnitCorrupt is raised when parent strings are invalid.
229
                                     "line %r: %s" % (py_line, e))
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
230
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
231
        cache_entry = PyDict_GetItem_void(self.cache, version_id)
232
        if cache_entry == NULL:
233
            PyList_Append(self.history, version_id)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
234
            index = self.history_len
235
            self.history_len = self.history_len + 1
236
        else:
2484.1.22 by John Arbash Meinel
Remove unused function definitions.
237
            # PyTuple_GetItem_void_void does *not* increment the reference
238
            # counter, but casting to <object> does.
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
239
            index = <object>PyTuple_GetItem_void_void(cache_entry, 5)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
240
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
241
        PyDict_SetItem(self.cache, version_id,
242
                       (version_id,
243
                        options,
244
                        pos,
245
                        size,
246
                        parents,
247
                        index,
248
                       ))
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
249
        return 1
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
250
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
251
    cdef int process_next_record(self) except -1:
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
252
        """Process the next record in the file."""
253
        cdef char *last
254
        cdef char *start
255
256
        start = self.cur_str
257
        # Find the next newline
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
258
        last = <char*>memchr(start, c'\n', self.end_str - start)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
259
        if last == NULL:
260
            # Process until the end of the file
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
261
            last = self.end_str - 1
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
262
            self.cur_str = self.end_str
263
        else:
264
            # The last character is right before the '\n'
265
            # And the next string is right after it
266
            self.cur_str = last + 1
267
            last = last - 1
268
269
        if last <= start or last[0] != c':':
270
            # Incomplete record
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
271
            return 0
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
272
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
273
        return self.process_one_record(start, last)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
274
275
    def read(self):
2484.1.8 by John Arbash Meinel
A bit of cleanup. remove unused variables.
276
        cdef int text_size
277
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
278
        self.validate()
279
2484.1.10 by John Arbash Meinel
Revert previous change since it doesn't help
280
        self.kndx.check_header(self.fp)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
281
282
        # We read the whole thing at once
283
        # TODO: jam 2007-05-09 Consider reading incrementally rather than
284
        #       having to have the whole thing read up front.
285
        #       we already know that calling f.readlines() versus lots of
286
        #       f.readline() calls is faster.
2484.1.8 by John Arbash Meinel
A bit of cleanup. remove unused variables.
287
        #       The other possibility is to avoid a Python String here
288
        #       completely. However self.fp may be a 'file-like' object
289
        #       it is not guaranteed to be a real file.
2484.1.10 by John Arbash Meinel
Revert previous change since it doesn't help
290
        text = self.fp.read()
2484.1.8 by John Arbash Meinel
A bit of cleanup. remove unused variables.
291
        text_size = PyString_Size(text)
292
        self.cur_str = PyString_AsString(text)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
293
        # This points to the last character in the string
2484.1.8 by John Arbash Meinel
A bit of cleanup. remove unused variables.
294
        self.end_str = self.cur_str + text_size
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
295
296
        while self.cur_str < self.end_str:
297
            self.process_next_record()
298
299
2484.1.1 by John Arbash Meinel
Add an initial function to read knit indexes in pyrex.
300
def _load_data_c(kndx, fp):
301
    """Load the knit index file into memory."""
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
302
    reader = KnitIndexReader(kndx, fp)
303
    reader.read()