~bzr-pqm/bzr/bzr.dev

3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
1
# Copyright (C) 2008 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob
update FSF mailing address
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
16
#
17
18
"""Pyrex extensions for converting chunks to lines."""
19
20
#python2.4 support
21
cdef extern from "python-compat.h":
22
    pass
23
24
cdef extern from "stdlib.h":
25
    ctypedef unsigned size_t
26
27
cdef extern from "Python.h":
28
    ctypedef int Py_ssize_t # Required for older pyrex versions
29
    ctypedef struct PyObject:
30
        pass
31
    int PyList_Append(object lst, object item) except -1
32
3890.2.11 by John Arbash Meinel
A bit more tweaking of the pyrex version. Shave off another 10% by
33
    int PyString_CheckExact(object p)
34
    char *PyString_AS_STRING(object p)
35
    Py_ssize_t PyString_GET_SIZE(object p)
3890.2.15 by John Arbash Meinel
Update to do a single iteration over the chunks.
36
    object PyString_FromStringAndSize(char *c_str, Py_ssize_t len)
3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
37
38
cdef extern from "string.h":
39
    void *memchr(void *s, int c, size_t n)
40
41
42
def chunks_to_lines(chunks):
3890.2.10 by John Arbash Meinel
Change the python implementation to a friendlier implementation.
43
    """Re-split chunks into simple lines.
44
45
    Each entry in the result should contain a single newline at the end. Except
46
    for the last entry which may not have a final newline. If chunks is already
47
    a simple list of lines, we return it directly.
48
49
    :param chunks: An list/tuple of strings. If chunks is already a list of
50
        lines, then we will return it as-is.
51
    :return: A list of strings.
52
    """
3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
53
    cdef char *c_str
54
    cdef char *newline
3890.2.16 by John Arbash Meinel
If we split into 2 loops, we get 440us for already lines, and the
55
    cdef char *c_last
3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
56
    cdef Py_ssize_t the_len
3890.2.10 by John Arbash Meinel
Change the python implementation to a friendlier implementation.
57
    cdef int last_no_newline
3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
58
59
    # Check to see if the chunks are already lines
3890.2.10 by John Arbash Meinel
Change the python implementation to a friendlier implementation.
60
    last_no_newline = 0
3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
61
    for chunk in chunks:
3890.2.10 by John Arbash Meinel
Change the python implementation to a friendlier implementation.
62
        if last_no_newline:
63
            # We have a chunk which followed a chunk without a newline, so this
64
            # is not a simple list of lines.
3890.2.16 by John Arbash Meinel
If we split into 2 loops, we get 440us for already lines, and the
65
            break
3890.2.11 by John Arbash Meinel
A bit more tweaking of the pyrex version. Shave off another 10% by
66
        # Switching from PyString_AsStringAndSize to PyString_CheckExact and
67
        # then the macros GET_SIZE and AS_STRING saved us 40us / 470us.
68
        # It seems PyString_AsStringAndSize can actually trigger a conversion,
69
        # which we don't want anyway.
70
        if not PyString_CheckExact(chunk):
71
            raise TypeError('chunk is not a string')
72
        the_len = PyString_GET_SIZE(chunk)
3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
73
        if the_len == 0:
3890.2.16 by John Arbash Meinel
If we split into 2 loops, we get 440us for already lines, and the
74
            # An empty string is never a valid line
75
            break
76
        c_str = PyString_AS_STRING(chunk)
77
        c_last = c_str + the_len - 1
78
        newline = <char *>memchr(c_str, c'\n', the_len)
79
        if newline != c_last:
80
            if newline == NULL:
81
                # Missing a newline. Only valid as the last line
82
                last_no_newline = 1
83
            else:
84
                # There is a newline in the middle, we must resplit
85
                break
86
    else:
87
        # Everything was already a list of lines
88
        return chunks
89
90
    # We know we need to create a new list of lines
91
    lines = []
92
    tail = None # Any remainder from the previous chunk
93
    for chunk in chunks:
94
        if tail is not None:
95
            chunk = tail + chunk
96
            tail = None
97
        if not PyString_CheckExact(chunk):
98
            raise TypeError('chunk is not a string')
99
        the_len = PyString_GET_SIZE(chunk)
100
        if the_len == 0:
3890.2.15 by John Arbash Meinel
Update to do a single iteration over the chunks.
101
            # An empty string is never a valid line, and we don't need to
102
            # append anything
103
            continue
3890.2.11 by John Arbash Meinel
A bit more tweaking of the pyrex version. Shave off another 10% by
104
        c_str = PyString_AS_STRING(chunk)
3890.2.7 by John Arbash Meinel
A Pyrex extension is about 5x faster than the fastest python code I could write.
105
        c_last = c_str + the_len - 1
106
        newline = <char *>memchr(c_str, c'\n', the_len)
3890.2.15 by John Arbash Meinel
Update to do a single iteration over the chunks.
107
        if newline == c_last:
108
            # A simple line
109
            PyList_Append(lines, chunk)
110
        elif newline == NULL:
111
            # A chunk without a newline, if this is the last entry, then we
112
            # allow it
113
            tail = chunk
114
        else:
115
            # We have a newline in the middle, loop until we've consumed all
116
            # lines
117
            while newline != NULL:
118
                line = PyString_FromStringAndSize(c_str, newline - c_str + 1)
119
                PyList_Append(lines, line)
120
                c_str = newline + 1
121
                if c_str > c_last: # We are done
122
                    break
123
                the_len = c_last - c_str + 1
124
                newline = <char *>memchr(c_str, c'\n', the_len)
125
                if newline == NULL:
126
                    tail = PyString_FromStringAndSize(c_str, the_len)
127
                    break
128
    if tail is not None:
129
        PyList_Append(lines, tail)
130
    return lines