~bzr-pqm/bzr/bzr.dev

2484.1.1 by John Arbash Meinel
Add an initial function to read knit indexes in pyrex.
1
# Copyright (C) 2007 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob
update FSF mailing address
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2484.1.1 by John Arbash Meinel
Add an initial function to read knit indexes in pyrex.
16
17
"""Pyrex extensions to knit parsing."""
18
2484.1.10 by John Arbash Meinel
Revert previous change since it doesn't help
19
import sys
2484.1.1 by John Arbash Meinel
Add an initial function to read knit indexes in pyrex.
20
2484.1.11 by John Arbash Meinel
[merge] bzr.dev 2563 and add integer checking to the extension.
21
from bzrlib import errors
22
23
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
24
cdef extern from "stdlib.h":
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
25
    ctypedef unsigned size_t
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
26
    long int strtol(char *nptr, char **endptr, int base)
27
28
29
cdef extern from "Python.h":
30
    int PyDict_CheckExact(object)
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
31
    void *PyDict_GetItem_void "PyDict_GetItem" (object p, object key)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
32
    int PyDict_SetItem(object p, object key, object val) except -1
33
34
    int PyList_Append(object lst, object item) except -1
2484.1.22 by John Arbash Meinel
Remove unused function definitions.
35
    object PyList_GET_ITEM(object lst, int index)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
36
    int PyList_CheckExact(object)
37
38
    void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)
39
40
    char *PyString_AsString(object p)
41
    object PyString_FromStringAndSize(char *, int)
42
    int PyString_Size(object p)
43
44
    void Py_INCREF(object)
45
46
47
cdef extern from "string.h":
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
48
    void *memchr(void *s, int c, size_t n)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
49
50
2484.1.18 by John Arbash Meinel
Test that we properly verify the size and position strings.
51
cdef int string_to_int_safe(char *s, char *end, int *out) except -1:
52
    """Convert a base10 string to an integer.
53
54
    This makes sure the whole string is consumed, or it raises ValueError.
55
    This is similar to how int(s) works, except you don't need a Python
56
    String object.
57
58
    :param s: The string to convert
59
    :param end: The character after the integer. So if the string is '12\0',
60
        this should be pointing at the '\0'. If the string was '12 ' then this
61
        should point at the ' '.
62
    :param out: This is the integer that will be returned
63
    :return: -1 if an exception is raised. 0 otherwise
64
    """
65
    cdef char *integer_end
66
67
    # We can't just return the integer because of how pyrex determines when
68
    # there is an exception.
2484.1.21 by John Arbash Meinel
Make sure we cast the long int back to a regular int
69
    out[0] = <int>strtol(s, &integer_end, 10)
2484.1.18 by John Arbash Meinel
Test that we properly verify the size and position strings.
70
    if integer_end != end:
71
        py_s = PyString_FromStringAndSize(s, end-s)
72
        raise ValueError('%r is not a valid integer' % (py_s,))
73
    return 0
74
75
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
76
cdef class KnitIndexReader:
77
78
    cdef object kndx
79
    cdef object fp
80
81
    cdef object cache
82
    cdef object history
83
84
    cdef char * cur_str
85
    cdef char * end_str
86
87
    cdef int history_len
88
2904.2.1 by John Arbash Meinel
Switch from __new__ to __init__ to avoid potential pyrex upgrade problems.
89
    def __init__(self, kndx, fp):
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
90
        self.kndx = kndx
91
        self.fp = fp
92
93
        self.cache = kndx._cache
94
        self.history = kndx._history
95
96
        self.cur_str = NULL
97
        self.end_str = NULL
98
        self.history_len = 0
99
4634.117.6 by John Arbash Meinel
Start going through the test failures.
100
    cdef int validate(self) except -1:
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
101
        if not PyDict_CheckExact(self.cache):
102
            raise TypeError('kndx._cache must be a python dict')
103
        if not PyList_CheckExact(self.history):
104
            raise TypeError('kndx._history must be a python list')
4634.117.6 by John Arbash Meinel
Start going through the test failures.
105
        return 0
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
106
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
107
    cdef object process_options(self, char *option_str, char *end):
108
        """Process the options string into a list."""
109
        cdef char *next
110
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
111
        # This is alternative code which creates a python string and splits it.
112
        # It is "correct" and more obvious, but slower than the following code.
113
        # It can be uncommented to switch in case the other code is seen as
114
        # suspect.
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
115
        # options = PyString_FromStringAndSize(option_str,
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
116
        #                                      end - option_str)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
117
        # return options.split(',')
118
119
        final_options = []
120
121
        while option_str < end:
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
122
            next = <char*>memchr(option_str, c',', end - option_str)
123
            if next == NULL:
124
                next = end
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
125
            next_option = PyString_FromStringAndSize(option_str,
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
126
                                                     next - option_str)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
127
            PyList_Append(final_options, next_option)
2484.1.13 by John Arbash Meinel
Add a test that KnitCorrupt is raised when parent strings are invalid.
128
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
129
            # Move past the ','
130
            option_str = next+1
131
132
        return final_options
133
134
    cdef object process_parents(self, char *parent_str, char *end):
135
        cdef char *next
136
        cdef int int_parent
2484.1.11 by John Arbash Meinel
[merge] bzr.dev 2563 and add integer checking to the extension.
137
        cdef char *parent_end
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
138
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
139
        # Alternative, correct but slower code.
140
        #
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
141
        # parents = PyString_FromStringAndSize(parent_str,
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
142
        #                                      end - parent_str)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
143
        # real_parents = []
144
        # for parent in parents.split():
145
        #     if parent[0].startswith('.'):
146
        #         real_parents.append(parent[1:])
147
        #     else:
148
        #         real_parents.append(self.history[int(parent)])
149
        # return real_parents
150
151
        parents = []
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
152
        while parent_str <= end:
153
            next = <char*>memchr(parent_str, c' ', end - parent_str)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
154
            if next == NULL or next >= end or next == parent_str:
155
                break
156
157
            if parent_str[0] == c'.':
158
                # This is an explicit revision id
159
                parent_str = parent_str + 1
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
160
                parent = PyString_FromStringAndSize(parent_str,
161
                                                    next - parent_str)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
162
            else:
163
                # This in an integer mapping to original
2484.1.18 by John Arbash Meinel
Test that we properly verify the size and position strings.
164
                string_to_int_safe(parent_str, next, &int_parent)
2484.1.11 by John Arbash Meinel
[merge] bzr.dev 2563 and add integer checking to the extension.
165
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
166
                if int_parent >= self.history_len:
167
                    raise IndexError('Parent index refers to a revision which'
168
                        ' does not exist yet.'
169
                        ' %d > %d' % (int_parent, self.history_len))
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
170
                parent = PyList_GET_ITEM(self.history, int_parent)
171
                # PyList_GET_ITEM steals a reference
172
                Py_INCREF(parent)
173
            PyList_Append(parents, parent)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
174
            parent_str = next + 1
3287.5.5 by Robert Collins
Refactor internals of knit implementations to implement get_parents_with_ghosts in terms of get_parent_map.
175
        return tuple(parents)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
176
177
    cdef int process_one_record(self, char *start, char *end) except -1:
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
178
        """Take a simple string and split it into an index record."""
179
        cdef char *version_id_str
180
        cdef int version_id_size
181
        cdef char *option_str
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
182
        cdef char *option_end
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
183
        cdef char *pos_str
184
        cdef int pos
185
        cdef char *size_str
186
        cdef int size
187
        cdef char *parent_str
188
        cdef int parent_size
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
189
        cdef void *cache_entry
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
190
191
        version_id_str = start
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
192
        option_str = <char*>memchr(version_id_str, c' ', end - version_id_str)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
193
        if option_str == NULL or option_str >= end:
194
            # Short entry
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
195
            return 0
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
196
        version_id_size = <int>(option_str - version_id_str)
197
        # Move past the space character
198
        option_str = option_str + 1
199
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
200
        pos_str = <char*>memchr(option_str, c' ', end - option_str)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
201
        if pos_str == NULL or pos_str >= end:
202
            # Short entry
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
203
            return 0
204
        option_end = pos_str
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
205
        pos_str = pos_str + 1
206
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
207
        size_str = <char*>memchr(pos_str, c' ', end - pos_str)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
208
        if size_str == NULL or size_str >= end:
209
            # Short entry
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
210
            return 0
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
211
        size_str = size_str + 1
212
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
213
        parent_str = <char*>memchr(size_str, c' ', end - size_str)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
214
        if parent_str == NULL or parent_str >= end:
215
            # Missing parents
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
216
            return 0
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
217
        parent_str = parent_str + 1
218
219
        version_id = PyString_FromStringAndSize(version_id_str,
220
                                                version_id_size)
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
221
        options = self.process_options(option_str, option_end)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
222
2484.1.11 by John Arbash Meinel
[merge] bzr.dev 2563 and add integer checking to the extension.
223
        try:
2484.1.18 by John Arbash Meinel
Test that we properly verify the size and position strings.
224
            string_to_int_safe(pos_str, size_str - 1, &pos)
225
            string_to_int_safe(size_str, parent_str - 1, &size)
2484.1.11 by John Arbash Meinel
[merge] bzr.dev 2563 and add integer checking to the extension.
226
            parents = self.process_parents(parent_str, end)
227
        except (ValueError, IndexError), e:
228
            py_line = PyString_FromStringAndSize(start, end - start)
229
            raise errors.KnitCorrupt(self.kndx._filename,
2484.1.13 by John Arbash Meinel
Add a test that KnitCorrupt is raised when parent strings are invalid.
230
                                     "line %r: %s" % (py_line, e))
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
231
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
232
        cache_entry = PyDict_GetItem_void(self.cache, version_id)
233
        if cache_entry == NULL:
234
            PyList_Append(self.history, version_id)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
235
            index = self.history_len
236
            self.history_len = self.history_len + 1
237
        else:
2484.1.22 by John Arbash Meinel
Remove unused function definitions.
238
            # PyTuple_GetItem_void_void does *not* increment the reference
239
            # counter, but casting to <object> does.
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
240
            index = <object>PyTuple_GetItem_void_void(cache_entry, 5)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
241
2484.1.6 by John Arbash Meinel
Use direct functions when possible, and avoid extra dict lookups.
242
        PyDict_SetItem(self.cache, version_id,
243
                       (version_id,
244
                        options,
245
                        pos,
246
                        size,
247
                        parents,
248
                        index,
249
                       ))
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
250
        return 1
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
251
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
252
    cdef int process_next_record(self) except -1:
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
253
        """Process the next record in the file."""
254
        cdef char *last
255
        cdef char *start
256
257
        start = self.cur_str
258
        # Find the next newline
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
259
        last = <char*>memchr(start, c'\n', self.end_str - start)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
260
        if last == NULL:
261
            # Process until the end of the file
2484.1.16 by John Arbash Meinel
Switch from strchr to memchr to ensure we never search all of memory.
262
            last = self.end_str - 1
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
263
            self.cur_str = self.end_str
264
        else:
265
            # The last character is right before the '\n'
266
            # And the next string is right after it
267
            self.cur_str = last + 1
268
            last = last - 1
269
270
        if last <= start or last[0] != c':':
271
            # Incomplete record
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
272
            return 0
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
273
2484.1.5 by John Arbash Meinel
Simplistic implementations of custom parsers for options and parents
274
        return self.process_one_record(start, last)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
275
276
    def read(self):
2484.1.8 by John Arbash Meinel
A bit of cleanup. remove unused variables.
277
        cdef int text_size
278
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
279
        self.validate()
280
2484.1.10 by John Arbash Meinel
Revert previous change since it doesn't help
281
        self.kndx.check_header(self.fp)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
282
283
        # We read the whole thing at once
284
        # TODO: jam 2007-05-09 Consider reading incrementally rather than
285
        #       having to have the whole thing read up front.
286
        #       we already know that calling f.readlines() versus lots of
287
        #       f.readline() calls is faster.
2484.1.8 by John Arbash Meinel
A bit of cleanup. remove unused variables.
288
        #       The other possibility is to avoid a Python String here
289
        #       completely. However self.fp may be a 'file-like' object
290
        #       it is not guaranteed to be a real file.
2484.1.10 by John Arbash Meinel
Revert previous change since it doesn't help
291
        text = self.fp.read()
2484.1.8 by John Arbash Meinel
A bit of cleanup. remove unused variables.
292
        text_size = PyString_Size(text)
293
        self.cur_str = PyString_AsString(text)
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
294
        # This points to the last character in the string
2484.1.8 by John Arbash Meinel
A bit of cleanup. remove unused variables.
295
        self.end_str = self.cur_str + text_size
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
296
297
        while self.cur_str < self.end_str:
298
            self.process_next_record()
299
300
2484.1.1 by John Arbash Meinel
Add an initial function to read knit indexes in pyrex.
301
def _load_data_c(kndx, fp):
302
    """Load the knit index file into memory."""
2484.1.4 by John Arbash Meinel
First step towards custom parsing.
303
    reader = KnitIndexReader(kndx, fp)
304
    reader.read()