# ~bzr-pqm/bzr/bzr.dev : contents of bzrlib/_knit_load_data
#
# ~bzr-pqm/bzr/bzr.dev : (revision 2760)
# Copyright (C) 2007 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Pyrex extensions to knit parsing."""

import sys

from bzrlib import errors


# C standard library declarations needed for parsing integers out of the raw
# index text.
cdef extern from "stdlib.h":
    # Declared here so that the memchr() declaration further down can use it
    # for its length argument.
    ctypedef unsigned size_t
    # Used by string_to_int_safe(); on return *endptr points at the first
    # character that was not consumed as part of the number.
    long int strtol(char *nptr, char **endptr, int base)


# Direct declarations of the CPython C-API calls used by the parser.
# Functions declared with 'except -1' have their -1 return value turned into
# a propagated Python exception by Pyrex.
cdef extern from "Python.h":
    int PyDict_CheckExact(object)
    # PyDict_GetItem is exposed under a different name with a 'void *' return
    # type, so the result can be compared against NULL (key not present)
    # without Pyrex treating it as a Python object.
    void *PyDict_GetItem_void "PyDict_GetItem" (object p, object key)
    int PyDict_SetItem(object p, object key, object val) except -1

    int PyList_Append(object lst, object item) except -1
    # Macro form: the caller is responsible for making sure 'index' is in
    # range, and for taking a reference to the returned item.
    object PyList_GET_ITEM(object lst, int index)
    int PyList_CheckExact(object)

    # PyTuple_GET_ITEM working purely on 'void *', for use on the raw pointer
    # returned by PyDict_GetItem_void. No reference counting is done until the
    # result is cast to <object>.
    void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)

    # Returns a pointer to the string's internal buffer, not a copy.
    char *PyString_AsString(object p)
    object PyString_FromStringAndSize(char *, int)
    int PyString_Size(object p)

    void Py_INCREF(object)


cdef extern from "string.h":
    # Used throughout the parser to find the next separator character
    # (' ', ',' or '\n') inside a bounded region of the index text.
    # Returns NULL when the character does not occur in the first n bytes.
    void *memchr(void *s, int c, size_t n)


cdef int string_to_int_safe(char *s, char *end, int *out) except -1:
    """Convert a base10 string to an integer.

    This makes sure the whole string is consumed, or it raises ValueError.
    This is similar to how int(s) works, except you don't need a Python
    String object.

    The text between s and end does not have to be NUL terminated, but the
    buffer it lives in must contain some non-numeric character at or after
    'end' so that strtol() stops scanning.

    :param s: The string to convert
    :param end: The character after the integer. So if the string is '12\0',
        this should be pointing at the '\0'. If the string was '12 ' then this
        should point at the ' '.
    :param out: This is the integer that will be returned
    :return: -1 if an exception is raised. 0 otherwise
    :raises ValueError: if the text is empty, is not entirely a base 10
        integer, or does not fit in a C int.
    """
    cdef char *integer_end
    cdef long value

    # We can't just return the integer because of how pyrex determines when
    # there is an exception.
    value = strtol(s, &integer_end, 10)
    out[0] = <int>value
    # integer_end == s means strtol() did not consume a single digit. Without
    # this check an empty field (s == end) would be silently accepted as 0.
    if integer_end != end or integer_end == s:
        py_s = PyString_FromStringAndSize(s, end-s)
        raise ValueError('%r is not a valid integer' % (py_s,))
    # strtol() returns a long, which may be wider than int. Refuse values that
    # would be silently truncated by the cast above.
    # NOTE(review): on platforms where long and int have the same width this
    # cannot detect strtol() clamping to LONG_MAX/LONG_MIN; that would need an
    # errno check.
    if <long>out[0] != value:
        py_s = PyString_FromStringAndSize(s, end-s)
        raise ValueError('%r is too large to fit in an integer' % (py_s,))
    return 0


cdef class KnitIndexReader:
    """Parse the text of a knit index file directly into a knit index object.

    The reader fills in two attributes of the index object it is given:

    * kndx._history: a list of version ids, in the order they were first seen.
    * kndx._cache: a dict mapping version_id to the tuple
      (version_id, options, pos, size, parents, index), where index is the
      position of version_id in kndx._history.

    Each record is a single line of the form::

        version_id options pos size parents :

    Fields are separated by single spaces, options are separated by ',', and
    each parent is either '.' followed by an explicit revision id or an
    integer index into the history seen so far. Lines that do not end in ':'
    or that have too few fields are skipped.
    """

    cdef object kndx
    cdef object fp

    cdef object cache
    cdef object history

    # The unparsed region of the text currently being read. Both pointers
    # refer to the buffer of the string returned by fp.read() and are only
    # meaningful while read() is running.
    cdef char * cur_str
    cdef char * end_str

    # Number of entries in self.history, kept as a C int so parent indexes can
    # be bounds-checked without calling back into Python.
    cdef int history_len

    def __new__(self, kndx, fp):
        self.kndx = kndx
        self.fp = fp

        self.cache = kndx._cache
        self.history = kndx._history

        self.cur_str = NULL
        self.end_str = NULL
        # The real value is established in read(), once self.history has been
        # validated to be a list.
        self.history_len = 0

    cdef int validate(self) except -1:
        """Check that the containers can be used with the exact C-API calls.

        This has to be declared with 'except -1': an exception raised inside
        a 'cdef void' function cannot be propagated to the caller, which would
        let read() carry on using unchecked macros such as PyList_GET_ITEM
        on objects of the wrong type.

        :return: 0 on success
        :raises TypeError: if kndx._cache is not exactly a dict, or
            kndx._history is not exactly a list.
        """
        if not PyDict_CheckExact(self.cache):
            raise TypeError('kndx._cache must be a python dict')
        if not PyList_CheckExact(self.history):
            raise TypeError('kndx._history must be a python list')
        return 0

    cdef object process_options(self, char *option_str, char *end):
        """Process the options string into a list.

        :param option_str: Start of the comma separated options text.
        :param end: The character just after the options text.
        :return: A list with one string per option.
        """
        cdef char *next

        # This is alternative code which creates a python string and splits it.
        # It is "correct" and more obvious, but slower than the following code.
        # It can be uncommented to switch in case the other code is seen as
        # suspect.
        # options = PyString_FromStringAndSize(option_str,
        #                                      end - option_str)
        # return options.split(',')

        final_options = []

        while option_str < end:
            next = <char*>memchr(option_str, c',', end - option_str)
            if next == NULL:
                # No more separators, the last option runs up to the end.
                next = end
            next_option = PyString_FromStringAndSize(option_str,
                                                     next - option_str)
            PyList_Append(final_options, next_option)

            # Move past the ','
            option_str = next+1

        return final_options

    cdef object process_parents(self, char *parent_str, char *end):
        """Process the parents string into a list of revision ids.

        Every parent must be followed by a space; parsing stops at the first
        empty entry or at the first entry that is not terminated by a space
        before 'end'.

        :param parent_str: Start of the space separated parents text.
        :param end: The character just after the parents text.
        :return: A list of revision id strings.
        :raises ValueError: if a parent that does not start with '.' is not a
            valid integer.
        :raises IndexError: if an integer parent does not refer to an entry
            that is already in the history.
        """
        cdef char *next
        cdef int int_parent

        # Alternative, correct but slower code.
        #
        # parents = PyString_FromStringAndSize(parent_str,
        #                                      end - parent_str)
        # real_parents = []
        # for parent in parents.split():
        #     if parent[0].startswith('.'):
        #         real_parents.append(parent[1:])
        #     else:
        #         real_parents.append(self.history[int(parent)])
        # return real_parents

        parents = []
        while parent_str <= end:
            next = <char*>memchr(parent_str, c' ', end - parent_str)
            if next == NULL or next >= end or next == parent_str:
                break

            if parent_str[0] == c'.':
                # This is an explicit revision id
                parent_str = parent_str + 1
                parent = PyString_FromStringAndSize(parent_str,
                                                    next - parent_str)
            else:
                # This is an integer mapping to the original
                string_to_int_safe(parent_str, next, &int_parent)

                # PyList_GET_ITEM does no bounds checking of its own, so both
                # ends of the range have to be checked here. strtol() happily
                # parses a leading '-'.
                if int_parent < 0:
                    raise IndexError('Parent index must not be negative:'
                        ' %d' % (int_parent,))
                if int_parent >= self.history_len:
                    raise IndexError('Parent index refers to a revision which'
                        ' does not exist yet.'
                        ' %d > %d' % (int_parent, self.history_len))
                parent = PyList_GET_ITEM(self.history, int_parent)
                # PyList_GET_ITEM returns a borrowed reference, but Pyrex
                # treats an 'object' return value as a reference it owns, so
                # we have to take one for it.
                Py_INCREF(parent)
            PyList_Append(parents, parent)
            parent_str = next + 1
        return parents

    cdef int process_one_record(self, char *start, char *end) except -1:
        """Take a simple string and split it into an index record.

        :param start: First character of the record.
        :param end: The ':' that terminates the record.
        :return: 1 if the record was stored in the cache, 0 if it was skipped
            because it has too few fields.
        :raises KnitCorrupt: if the position, size or a parent cannot be
            interpreted.
        """
        cdef char *version_id_str
        cdef int version_id_size
        cdef char *option_str
        cdef char *option_end
        cdef char *pos_str
        cdef int pos
        cdef char *size_str
        cdef int size
        cdef char *parent_str
        cdef void *cache_entry

        # Locate the start of every field first, so that a short line is
        # rejected before any Python objects are created for it.
        version_id_str = start
        option_str = <char*>memchr(version_id_str, c' ', end - version_id_str)
        if option_str == NULL or option_str >= end:
            # Short entry
            return 0
        version_id_size = <int>(option_str - version_id_str)
        # Move past the space character
        option_str = option_str + 1

        pos_str = <char*>memchr(option_str, c' ', end - option_str)
        if pos_str == NULL or pos_str >= end:
            # Short entry
            return 0
        option_end = pos_str
        pos_str = pos_str + 1

        size_str = <char*>memchr(pos_str, c' ', end - pos_str)
        if size_str == NULL or size_str >= end:
            # Short entry
            return 0
        size_str = size_str + 1

        parent_str = <char*>memchr(size_str, c' ', end - size_str)
        if parent_str == NULL or parent_str >= end:
            # Missing parents
            return 0
        parent_str = parent_str + 1

        version_id = PyString_FromStringAndSize(version_id_str,
                                                version_id_size)
        options = self.process_options(option_str, option_end)

        try:
            # Each xxx_str points just past a space, so 'xxx_str - 1' is the
            # space that terminates the previous field.
            string_to_int_safe(pos_str, size_str - 1, &pos)
            string_to_int_safe(size_str, parent_str - 1, &size)
            parents = self.process_parents(parent_str, end)
        except (ValueError, IndexError), e:
            py_line = PyString_FromStringAndSize(start, end - start)
            raise errors.KnitCorrupt(self.kndx._filename,
                                     "line %r: %s" % (py_line, e))

        cache_entry = PyDict_GetItem_void(self.cache, version_id)
        if cache_entry == NULL:
            # First time this version is seen, it gets the next history slot.
            PyList_Append(self.history, version_id)
            index = self.history_len
            self.history_len = self.history_len + 1
        else:
            # PyTuple_GetItem_void_void does *not* increment the reference
            # counter, but casting to <object> does.
            index = <object>PyTuple_GetItem_void_void(cache_entry, 5)

        PyDict_SetItem(self.cache, version_id,
                       (version_id,
                        options,
                        pos,
                        size,
                        parents,
                        index,
                       ))
        return 1

    cdef int process_next_record(self) except -1:
        """Process the next record in the file.

        Advances self.cur_str past the line that was looked at.

        :return: 1 if a record was stored, 0 if the line was skipped.
        """
        cdef char *last
        cdef char *start

        start = self.cur_str
        # Find the next newline
        last = <char*>memchr(start, c'\n', self.end_str - start)
        if last == NULL:
            # Process until the end of the file
            last = self.end_str - 1
            self.cur_str = self.end_str
        else:
            # The last character is right before the '\n'
            # And the next string is right after it
            self.cur_str = last + 1
            last = last - 1

        # The 'last <= start' test has to come first: for an empty line
        # 'last' is before 'start' and must not be dereferenced.
        if last <= start or last[0] != c':':
            # Incomplete record
            return 0

        return self.process_one_record(start, last)

    def read(self):
        """Read all of self.fp and load its records into the knit index.

        :raises TypeError: if kndx._cache is not a dict or kndx._history is
            not a list.
        :raises KnitCorrupt: if a complete record cannot be interpreted.
        """
        cdef int text_size

        self.validate()

        # New versions are numbered from the end of the existing history, and
        # parent indexes are bounds-checked against this value, so it has to
        # match the actual list rather than assume that it starts out empty.
        self.history_len = len(self.history)

        self.kndx.check_header(self.fp)

        # We read the whole thing at once
        # TODO: jam 2007-05-09 Consider reading incrementally rather than
        #       having to have the whole thing read up front.
        #       we already know that calling f.readlines() versus lots of
        #       f.readline() calls is faster.
        #       The other possibility is to avoid a Python String here
        #       completely. However self.fp may be a 'file-like' object
        #       it is not guaranteed to be a real file.
        text = self.fp.read()
        text_size = PyString_Size(text)
        # cur_str points into the buffer owned by 'text', which has to stay
        # referenced until the loop below has finished.
        self.cur_str = PyString_AsString(text)
        # This points just past the last character in the string
        self.end_str = self.cur_str + text_size

        while self.cur_str < self.end_str:
            self.process_next_record()


def _load_data_c(kndx, fp):
    """Parse the knit index file fp and store its records on kndx.

    This is a thin entry point around KnitIndexReader.

    :param kndx: The knit index object to fill in. The reader uses its
        _cache, _history and _filename attributes and its check_header()
        method.
    :param fp: A file-like object whose read() returns the index text.
    """
    KnitIndexReader(kndx, fp).read()