4354.3.1
by Jelmer Vernooij
Move core RIO parsing functionality to _rio_py.py. |
1 |
# Copyright (C) 2009 Canonical Ltd
|
2 |
#
|
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
7 |
#
|
|
8 |
# This program is distributed in the hope that it will be useful,
|
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
12 |
#
|
|
13 |
# You should have received a copy of the GNU General Public License
|
|
14 |
# along with this program; if not, write to the Free Software
|
|
15 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
16 |
||
4354.3.2
by Jelmer Vernooij
Provide custom implementation of _read_stanza_utf8 in Pyrex. |
17 |
"""Pyrex implementation of _read_stanza_*."""
|
18 |
||
4354.3.3
by Jelmer Vernooij
More performance tweaks. |
19 |
#python2.4 support
|
20 |
cdef extern from "python-compat.h": |
|
21 |
pass
|
|
22 |
||
4368.1.1
by Jelmer Vernooij
Import malloc and friends from stdlib.h rather than from (deprecated) malloc.h. |
23 |
cdef extern from "stdlib.h": |
4354.3.11
by Jelmer Vernooij
Use shared data area when parsing pairs in stanza. |
24 |
void *malloc(int) |
25 |
void *realloc(void *, int) |
|
26 |
void free(void *) |
|
27 |
||
4354.3.3
by Jelmer Vernooij
More performance tweaks. |
28 |
cdef extern from "Python.h": |
29 |
ctypedef int Py_ssize_t # Required for older pyrex versions |
|
4354.3.10
by Jelmer Vernooij
Use Py_UNICODE in unicode RIO parser. |
30 |
ctypedef int Py_UNICODE |
4354.3.3
by Jelmer Vernooij
More performance tweaks. |
31 |
char *PyString_AS_STRING(object s) |
4354.3.4
by Jelmer Vernooij
More work using C API's rather than Python objects. |
32 |
Py_ssize_t PyString_GET_SIZE(object t) except -1 |
33 |
object PyUnicode_DecodeUTF8(char *string, Py_ssize_t length, char *errors) |
|
34 |
object PyString_FromStringAndSize(char *s, Py_ssize_t len) |
|
35 |
int PyString_CheckExact(object) |
|
36 |
int PyUnicode_CheckExact(object) |
|
37 |
object PyUnicode_Join(object, object) |
|
4354.3.10
by Jelmer Vernooij
Use Py_UNICODE in unicode RIO parser. |
38 |
object PyUnicode_EncodeASCII(Py_UNICODE *, int, char *) |
39 |
Py_UNICODE *PyUnicode_AS_UNICODE(object) |
|
40 |
Py_UNICODE *PyUnicode_AsUnicode(object) |
|
41 |
Py_ssize_t PyUnicode_GET_SIZE(object) except -1 |
|
4354.3.9
by Jelmer Vernooij
Use PyList_Append. |
42 |
int PyList_Append(object, object) except -1 |
4354.3.10
by Jelmer Vernooij
Use Py_UNICODE in unicode RIO parser. |
43 |
int Py_UNICODE_ISLINEBREAK(Py_UNICODE) |
44 |
object PyUnicode_FromUnicode(Py_UNICODE *, int) |
|
4354.3.11
by Jelmer Vernooij
Use shared data area when parsing pairs in stanza. |
45 |
void *Py_UNICODE_COPY(Py_UNICODE *, Py_UNICODE *, int) |
4354.3.4
by Jelmer Vernooij
More work using C API's rather than Python objects. |
46 |
|
4354.3.13
by Jelmer Vernooij
Add more RIO tests, fix bugs in pyrex implementation. |
47 |
cdef extern from "string.h": |
48 |
void *memcpy(void *, void *, int) |
|
49 |
||
4354.3.2
by Jelmer Vernooij
Provide custom implementation of _read_stanza_utf8 in Pyrex. |
50 |
from bzrlib.rio import Stanza |
51 |
||
4634.117.10
by John Arbash Meinel
Change 'no except' to 'cannot_raise' |
52 |
cdef int _valid_tag_char(char c): # cannot_raise
    # Report whether a single byte is permitted inside a RIO tag name.
    # Accepted bytes are ASCII letters, ASCII digits, '_' and '-'.
    # Returns 1 for an accepted byte and 0 for anything else.
    if c >= c'a' and c <= c'z':
        return 1
    if c >= c'A' and c <= c'Z':
        return 1
    if c >= c'0' and c <= c'9':
        return 1
    if c == c'_' or c == c'-':
        return 1
    return 0
|
57 |
||
58 |
||
4354.3.2
by Jelmer Vernooij
Provide custom implementation of _read_stanza_utf8 in Pyrex. |
59 |
def _valid_tag(tag):
    """Tell whether tag is acceptable as a RIO tag name.

    :param tag: The candidate tag; must be an exact ``str`` instance.
    :return: True if tag is non-empty and every byte in it is accepted by
        _valid_tag_char(), False otherwise.
    :raises TypeError: if tag is not an exact ``str``.
    """
    cdef char *tag_bytes
    cdef Py_ssize_t tag_size
    cdef int pos
    if not PyString_CheckExact(tag):
        raise TypeError(tag)
    tag_size = PyString_GET_SIZE(tag)
    if tag_size < 1:
        # An empty tag is never valid.
        return False
    tag_bytes = PyString_AS_STRING(tag)
    pos = 0
    while pos < tag_size:
        if not _valid_tag_char(tag_bytes[pos]):
            return False
        pos = pos + 1
    return True
|
4354.3.2
by Jelmer Vernooij
Provide custom implementation of _read_stanza_utf8 in Pyrex. |
73 |
|
4354.3.11
by Jelmer Vernooij
Use shared data area when parsing pairs in stanza. |
74 |
|
75 |
cdef object _split_first_line_utf8(char *line, int len,
                                   char *value, Py_ssize_t *value_len):
    # Split the first line of a tag/value pair held in a byte buffer.
    #
    # line/len:  the raw bytes of the line and how many of them to look at.
    # value:     destination buffer; everything after the first ": " is
    #            copied here (including any trailing newline). The caller
    #            must have made it large enough to hold the line.
    # value_len: out-parameter receiving the number of bytes copied.
    #
    # Returns the bytes before the first ':' as a new str object.
    # Raises ValueError if there is no ':' in the line, or if the first ':'
    # is not immediately followed by a space.
    #
    # Error messages are built from (line, len) explicitly rather than by
    # coercing the bare char pointer, so a line containing a NUL byte is
    # reported in full instead of being cut off at the NUL.
    cdef int i
    for i from 0 <= i < len:
        if line[i] == c':':
            # Check the bound first so we never peek past the len bytes
            # we were handed.
            if i + 1 >= len or line[i+1] != c' ':
                raise ValueError("invalid tag in line %r" %
                                 PyString_FromStringAndSize(line, len))
            memcpy(value, line+i+2, len-i-2)
            value_len[0] = len-i-2
            return PyString_FromStringAndSize(line, i)
    raise ValueError('tag/value separator not found in line %r' %
                     PyString_FromStringAndSize(line, len))
86 |
||
4354.3.2
by Jelmer Vernooij
Provide custom implementation of _read_stanza_utf8 in Pyrex. |
87 |
|
4354.3.11
by Jelmer Vernooij
Use shared data area when parsing pairs in stanza. |
88 |
cdef object _split_first_line_unicode(Py_UNICODE *line, int len,
                                      Py_UNICODE *value, Py_ssize_t *value_len):
    # Split the first line of a tag/value pair held in a Py_UNICODE buffer.
    #
    # line/len:  the code units of the line and how many of them to look at.
    # value:     destination buffer; everything after the first ": " is
    #            copied here (including any trailing newline). The caller
    #            must have made it large enough to hold the line.
    # value_len: out-parameter receiving the number of code units copied.
    #
    # Returns the text before the first ':' encoded as a strict-ASCII str
    # (so a non-ASCII tag makes the encoder raise).
    # Raises ValueError if there is no ':' in the line, or if the first ':'
    # is not immediately followed by a space.
    cdef int i
    for i from 0 <= i < len:
        if line[i] == c':':
            # Check the bound first so we never peek past the len code
            # units we were handed.
            if i + 1 >= len or line[i+1] != c' ':
                raise ValueError("invalid tag in line %r" %
                                 PyUnicode_FromUnicode(line, len))
            memcpy(value, &line[i+2], (len-i-2) * sizeof(Py_UNICODE))
            value_len[0] = len-i-2
            return PyUnicode_EncodeASCII(line, i, "strict")
    raise ValueError("tag/value separator not found in line %r" %
                     PyUnicode_FromUnicode(line, len))
|
101 |
||
102 |
||
4354.3.2
by Jelmer Vernooij
Provide custom implementation of _read_stanza_utf8 in Pyrex. |
103 |
def _read_stanza_utf8(line_iter):
    """Read one stanza from an iterable of UTF-8 encoded lines.

    Lines are consumed until the iterable is exhausted or until None, an
    empty string, or a line consisting of a single newline is seen.  A line
    starting with a tab continues the value of the preceding tag; any other
    line must have the form "tag: value".

    :param line_iter: Iterable yielding exact ``str`` lines (or None to
        signal end of file).
    :return: Stanza.from_pairs() applied to the list of
        (tag, unicode value) pairs read, or None if no tag line was seen.
    :raises TypeError: if a line is not an exact ``str``.
    :raises ValueError: for a continuation line before any tag, a line
        without a valid "tag: " prefix, an invalid tag, or a value that is
        not valid UTF-8 (strict decoding is used).
    :raises MemoryError: if the accumulation buffer cannot be (re)allocated.
    """
    cdef char *c_line
    cdef Py_ssize_t c_len
    cdef char *accum_value, *new_accum_value
    cdef Py_ssize_t accum_len, accum_size
    cdef Py_ssize_t value_len
    pairs = []
    tag = None
    accum_len = 0
    accum_size = 4096
    # A single C buffer is shared by all values in the stanza; it holds the
    # raw bytes of the value currently being accumulated.
    accum_value = <char *>malloc(accum_size)
    if accum_value == NULL:
        raise MemoryError
    try:
        for line in line_iter:
            if line is None:
                break # end of file
            if not PyString_CheckExact(line):
                raise TypeError("%r is not a plain string" % line)
            c_line = PyString_AS_STRING(line)
            c_len = PyString_GET_SIZE(line)
            if c_len < 1:
                break # end of file
            if c_len == 1 and c_line[0] == c"\n":
                break # end of stanza
            # Make sure the buffer can take this whole line on top of what
            # is already accumulated, before anything is copied into it.
            if accum_len + c_len > accum_size:
                accum_size = (accum_len + c_len)
                new_accum_value = <char *>realloc(accum_value, accum_size)
                if new_accum_value == NULL:
                    # accum_value is still valid here; the finally clause
                    # releases it.
                    raise MemoryError
                else:
                    accum_value = new_accum_value
            if c_line[0] == c'\t': # continues previous value
                if tag is None:
                    raise ValueError('invalid continuation line %r' % line)
                # Append everything after the leading tab.
                memcpy(accum_value+accum_len, c_line+1, c_len-1)
                accum_len = accum_len + c_len-1
            else: # new tag:value line
                if tag is not None:
                    # Flush the previous pair, dropping the final byte of
                    # the accumulated value (the line's trailing newline).
                    # Clamp at zero: an empty value on a line without a
                    # newline leaves accum_len at 0, and a negative length
                    # must not reach the decoder.
                    value_len = accum_len - 1
                    if value_len < 0:
                        value_len = 0
                    PyList_Append(pairs,
                        (tag, PyUnicode_DecodeUTF8(accum_value, value_len,
                                                   "strict")))
                # Restart accumulation at the beginning of the buffer.
                tag = _split_first_line_utf8(c_line, c_len, accum_value,
                                             &accum_len)
                if not _valid_tag(tag):
                    raise ValueError("invalid rio tag %r" % (tag,))
        if tag is not None: # add last tag-value
            value_len = accum_len - 1
            if value_len < 0:
                value_len = 0
            PyList_Append(pairs,
                (tag, PyUnicode_DecodeUTF8(accum_value, value_len, "strict")))
            return Stanza.from_pairs(pairs)
        else: # didn't see any content
            return None
    finally:
        free(accum_value)
|
4354.3.2
by Jelmer Vernooij
Provide custom implementation of _read_stanza_utf8 in Pyrex. |
156 |
|
157 |
||
158 |
def _read_stanza_unicode(unicode_iter):
    """Read one stanza from an iterable of unicode lines.

    Lines are consumed until the iterable is exhausted or until None, an
    empty string, or a line whose first character is a line break is seen.
    A line starting with a tab continues the value of the preceding tag;
    any other line must have the form "tag: value".

    :param unicode_iter: Iterable yielding exact ``unicode`` lines (or None
        to signal end of file).
    :return: Stanza.from_pairs() applied to the list of
        (tag, unicode value) pairs read, or None if no tag line was seen.
    :raises TypeError: if a line is not an exact ``unicode``.
    :raises ValueError: for a continuation line before any tag, a line
        without a valid "tag: " prefix, or an invalid tag.
    :raises MemoryError: if the accumulation buffer cannot be (re)allocated.
    """
    cdef Py_UNICODE *c_line
    cdef int c_len
    cdef Py_UNICODE *accum_value, *new_accum_value
    cdef Py_ssize_t accum_len, accum_size
    cdef Py_ssize_t value_len
    pairs = []
    tag = None
    accum_len = 0
    accum_size = 4096
    # A single C buffer is shared by all values in the stanza; sizes are
    # counted in Py_UNICODE units, hence the sizeof() scaling.
    accum_value = <Py_UNICODE *>malloc(accum_size*sizeof(Py_UNICODE))
    if accum_value == NULL:
        raise MemoryError
    try:
        for line in unicode_iter:
            if line is None:
                break       # end of file
            if not PyUnicode_CheckExact(line):
                raise TypeError("%r is not a unicode string" % line)
            c_line = PyUnicode_AS_UNICODE(line)
            c_len = PyUnicode_GET_SIZE(line)
            if c_len < 1:
                break        # end of file
            if Py_UNICODE_ISLINEBREAK(c_line[0]):
                break       # end of stanza
            # Make sure the buffer can take this whole line on top of what
            # is already accumulated, before anything is copied into it.
            if accum_len + c_len > accum_size:
                accum_size = accum_len + c_len
                new_accum_value = <Py_UNICODE *>realloc(accum_value,
                    accum_size*sizeof(Py_UNICODE))
                if new_accum_value == NULL:
                    # accum_value is still valid here; the finally clause
                    # releases it.
                    raise MemoryError
                else:
                    accum_value = new_accum_value
            if c_line[0] == c'\t': # continues previous value,
                if tag is None:
                    raise ValueError('invalid continuation line %r' % line)
                # Append everything after the leading tab.
                memcpy(&accum_value[accum_len], &c_line[1],
                    (c_len-1)*sizeof(Py_UNICODE))
                accum_len = accum_len + (c_len-1)
            else: # new tag:value line
                if tag is not None:
                    # Flush the previous pair, dropping the final character
                    # of the accumulated value (the line's trailing
                    # newline).  Clamp at zero: an empty value on a line
                    # without a newline leaves accum_len at 0, and a
                    # negative length must not reach PyUnicode_FromUnicode.
                    value_len = accum_len - 1
                    if value_len < 0:
                        value_len = 0
                    PyList_Append(pairs,
                        (tag, PyUnicode_FromUnicode(accum_value, value_len)))
                # Restart accumulation at the beginning of the buffer.
                tag = _split_first_line_unicode(c_line, c_len, accum_value,
                                                &accum_len)
                if not _valid_tag(tag):
                    raise ValueError("invalid rio tag %r" % (tag,))
        if tag is not None: # add last tag-value
            value_len = accum_len - 1
            if value_len < 0:
                value_len = 0
            PyList_Append(pairs,
                    (tag, PyUnicode_FromUnicode(accum_value, value_len)))
            return Stanza.from_pairs(pairs)
        else:     # didn't see any content
            return None
    finally:
        free(accum_value)