~bzr-pqm/bzr/bzr.dev

Committer: John Arbash Meinel
Date: 2007-05-09 16:49:44 UTC
mto: This revision was merged to the branch mainline in revision 2614.
Revision ID: john@arbash-meinel.com-20070509164944-8dzfnsgdamvujo4b

First step towards custom parsing.
By stepping through the main string instead of splitting it into lines,
we save approx. 40%:
  test_read_50k_index_c          617ms
  test_read_50k_index_c_again    592ms
  test_read_50k_index_py         895ms
  test_read_50k_index_py_again  1038ms

files modified:
bzrlib/knit_c.pyx

Show diffs side-by-side

added added

removed removed

bzrlib/knit_c.pyx

"""Pyrex extensions to knit parsing."""

# Integer parsers from the C standard library.
# strtol() is what process_one_record() uses to decode the 'pos' and 'size'
# fields of an index record; strtoul() is declared but not referenced in the
# code visible here.
cdef extern from "stdlib.h":
    long int strtol(char *nptr, char **endptr, int base)
    unsigned long int strtoul(char *nptr, char **endptr, int base)

# Direct declarations of the CPython C API entry points used by this module.
#
# A quoted name after the Pyrex identifier (e.g. "PyList_GET_ITEM") is the
# real C name; the Pyrex-side identifier is only an alias with a different
# signature.
# NOTE(review): the void* aliases presumably exist so Pyrex does not apply
# reference counting to the borrowed references those macros return --
# confirm before using them with owned references.
cdef extern from "Python.h":
    # -- dict --
    int PyDict_CheckExact(object)
    void *PyDict_GetItem(object p, object key)
    int PyDict_SetItem(object p, object key, object val) except -1

    # -- list --
    int PyList_Append(object lst, object item) except -1
    void *PyList_GetItem_object_void "PyList_GET_ITEM" (object lst, int index)
    # NOTE(review): PyList_GET_ITEM yields a borrowed reference in the C API
    # but is declared as returning 'object' here; verify the refcounting
    # at any call site.
    object PyList_GET_ITEM (object lst, int index)
    int PyList_CheckExact(object)

    # -- tuple --
    int PyTuple_CheckExact(object)
    void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)
    object PyTuple_New(int)
    # NOTE(review): no 'except -1' clause, so a failure here would not be
    # reported as a Python exception by the generated code.
    int PyTuple_SetItem(object tpl, int offset, object val)
    void PyTuple_SET_ITEM(object tpl, int offset, object val)
    object PyTuple_Pack(int n, ...)

    # -- str --
    char *PyString_AsString(object p)
    char *PyString_AS_STRING_void "PyString_AS_STRING" (void *p)
    object PyString_FromString(char *)
    object PyString_FromStringAndSize(char *, int)
    int PyString_Size(object p)
    int PyString_GET_SIZE_void "PyString_GET_SIZE" (void *p)
    int PyString_CheckExact(object p)

    # -- manual reference counting --
    void Py_INCREF(object)
    void Py_DECREF(object)

# C string helpers. strchr() is used to locate the field separators (' ')
# and the record terminator ('\n') inside the raw index text; strncmp() and
# strcmp() are declared but not referenced in the code visible here.
cdef extern from "string.h":
    char *strchr(char *s1, char c)
    int strncmp(char *s1, char *s2, int len)
    int strcmp(char *s1, char *s2)

cdef class KnitIndexReader:
    """Parse the text of a knit index file directly into kndx's caches.

    The whole file is read into one string and walked with C pointers
    instead of being split into per-line Python strings.

    Each complete record has the form::

        version_id options pos size [parent ...] :\n

    and is stored as ``kndx._cache[version_id] = (version_id, options, pos,
    size, parents, index)``, where ``options`` is the comma-split option
    list and ``index`` is the position of ``version_id`` in
    ``kndx._history``. Records that are truncated or do not end with ':'
    are silently skipped.
    """

    cdef object kndx
    cdef object fp
    cdef object cache
    cdef object history

    # 'text' keeps the Python string alive for as long as the raw pointers
    # below refer to its internal buffer.
    cdef object text
    cdef char * text_str
    cdef int text_size
    cdef char * cur_str
    cdef char * end_str

    # Number of entries in 'history'; maintained by hand so that the index
    # of a newly appended version is known without calling len().
    cdef int history_len

    def __new__(self, kndx, fp):
        self.kndx = kndx
        self.fp = fp

        self.cache = kndx._cache
        self.history = kndx._history
        self.text = None

        self.text_str = NULL
        self.text_size = 0
        self.cur_str = NULL
        self.end_str = NULL
        self.history_len = 0

    cdef int validate(self) except -1:
        """Check that the containers taken from kndx have the exact types.

        Declared ``except -1`` so that the TypeError actually propagates to
        the caller; a ``cdef void`` function cannot report an exception.

        :raises TypeError: if kndx._cache is not exactly a dict or
            kndx._history is not exactly a list.
        """
        if not PyDict_CheckExact(self.cache):
            raise TypeError('kndx._cache must be a python dict')
        if not PyList_CheckExact(self.history):
            raise TypeError('kndx._history must be a python list')
        return 0

    cdef int process_one_record(self, char *start, char *end) except -1:
        """Take a simple string and split it into an index record.

        :param start: first character of the record (start of version_id).
        :param end: the ':' that terminates the record; the parents field
            is everything between the size field and this pointer.
        :return: 0; -1 only signals that a Python exception is set (for
            example a parent reference that is not an integer, or one that
            is out of range for the history list).
        """
        cdef char *version_id_str
        cdef int version_id_size
        cdef char *option_str
        cdef int option_size
        cdef char *pos_str
        # strtol() returns a long; keeping the full width avoids truncating
        # offsets on platforms where long is wider than int.
        cdef long pos
        cdef char *size_str
        cdef long size
        cdef char *parent_str

        # strchr() is not bounded by 'end', so every result is checked
        # against 'end' before it is used.
        version_id_str = start
        option_str = strchr(version_id_str, c' ')
        if option_str == NULL or option_str >= end:
            # Short entry
            return 0
        version_id_size = <int>(option_str - version_id_str)
        # Move past the space character
        option_str = option_str + 1

        pos_str = strchr(option_str, c' ')
        if pos_str == NULL or pos_str >= end:
            # Short entry
            return 0
        option_size = <int>(pos_str - option_str)
        pos_str = pos_str + 1

        size_str = strchr(pos_str, c' ')
        if size_str == NULL or size_str >= end:
            # Short entry
            return 0
        size_str = size_str + 1

        # With no parents the space found here is the one directly before
        # the ':' terminator, which leaves an empty parents field.
        parent_str = strchr(size_str, c' ')
        if parent_str == NULL or parent_str >= end:
            # Missing parents
            return 0
        parent_str = parent_str + 1

        version_id = PyString_FromStringAndSize(version_id_str,
                                                version_id_size)
        options = PyString_FromStringAndSize(option_str, option_size)
        options = options.split(',')

        # TODO: Check that we are actually reading integers
        pos = strtol(pos_str, NULL, 10)
        size = strtol(size_str, NULL, 10)

        parents = PyString_FromStringAndSize(parent_str,
                                             <int>(end - parent_str))
        parents = parents.split()
        real_parents = []
        for parent in parents:
            if parent[0] == '.':
                # '.'-prefixed entries carry the parent id itself
                real_parents.append(parent[1:])
            else:
                # otherwise the entry is an index into the history list
                real_parents.append(self.history[int(parent)])

        # Only the first record seen for a version_id gets a history slot;
        # later records for the same id reuse that index.
        if version_id not in self.cache:
            self.history.append(version_id)
            index = self.history_len
            self.history_len = self.history_len + 1
        else:
            index = self.cache[version_id][5]

        self.cache[version_id] = (version_id,
                                  options,
                                  pos,
                                  size,
                                  real_parents,
                                  index,
                                 )
        return 0

    cdef int process_next_record(self) except -1:
        """Process the next record in the file.

        Consumes one line starting at self.cur_str, advances self.cur_str
        past it, and hands the line to process_one_record() if it ends
        with ':'.

        :return: 0; -1 only signals that a Python exception is set.
        """
        cdef char *last
        cdef char *start

        start = self.cur_str
        # Find the next newline
        last = strchr(start, c'\n')
        if last == NULL:
            # Process until the end of the file
            last = self.end_str - 1
            self.cur_str = self.end_str
        else:
            # The last character is right before the '\n'
            # And the next string is right after it
            self.cur_str = last + 1
            last = last - 1

        # 'last <= start' is tested first so that last[0] is never read
        # for an empty line, where 'last' lies before 'start'.
        if last <= start or last[0] != c':':
            # Incomplete record
            return 0

        self.process_one_record(start, last)
        return 0

    def read(self):
        """Read all of self.fp and load every record into the kndx caches.

        Calls kndx.check_header(fp) first, then parses the remainder of
        the file in a single pass.

        :raises TypeError: if kndx._cache / kndx._history have the wrong
            type (see validate()).
        """
        self.validate()

        self.kndx.check_header(self.fp)

        # The history list may already hold entries; new versions must be
        # numbered after them.
        self.history_len = len(self.history)

        # We read the whole thing at once
        # TODO: jam 2007-05-09 Consider reading incrementally rather than
        #       having to have the whole thing read up front.
        #       we already know that calling f.readlines() versus lots of
        #       f.readline() calls is faster.
        self.text = self.fp.read()
        self.text_str = PyString_AsString(self.text)
        self.text_size = PyString_Size(self.text)
        self.cur_str = self.text_str
        # This points one past the last character in the string
        self.end_str = self.text_str + self.text_size

        while self.cur_str < self.end_str:
            self.process_next_record()

225

226

227

def _load_data_c(kndx, fp):
    """Load the knit index file into memory.

    The parsing is delegated to KnitIndexReader, which checks the header,
    reads the rest of ``fp`` in one call and fills ``kndx._cache`` and
    ``kndx._history`` in place. The file must be consumed exactly once, so
    no other parsing loop may run over ``fp`` before the reader does.

    :param kndx: the index object; it must provide ``_cache`` (a dict),
        ``_history`` (a list) and ``check_header(fp)``.
    :param fp: file-like object positioned at the start of the index file.
    """
    reader = KnitIndexReader(kndx, fp)
    reader.read()

Older »