~bzr-pqm/bzr/bzr.dev

Committer: John Arbash Meinel
Date: 2008-07-24 05:18:51 UTC
mto: (0.17.18 trunk)
mto: This revision was merged to the branch mainline in revision 4280.
Revision ID: john@arbash-meinel.com-20080724051851-5c96p66uwasrhp1v

A bit more work, not really usable yet.
I *do* have an idea of where I want to go with this.
And I think it can be quite a bit simpler than the patience-diff hash tables.

files modified:
_groupcompress_c.pyx

groupcompress.py

Show diffs side-by-side

added added

removed removed

_groupcompress_c.pyx

"""Compiled extensions for doing compression."""

cdef extern from *:

ctypedef unsigned long size_t

void * malloc(size_t)

void free(void *)

cdef extern from "Python.h":

struct _PyObject:

pass

ctypedef _PyObject PyObject

PyObject *PySequence_Fast(object, char *) except NULL

Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *)

PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t)

long PyObject_Hash(PyObject *) except -1

void Py_DECREF(PyObject *)

void Py_INCREF(PyObject *)

cdef struct _raw_line:

long hash # Cached form of the hash for this entry

int next # Next line which is equivalent to this one

PyObject *data # Raw pointer to the original line

cdef struct _hash_bucket:

int line_index # First line in the left side for this bucket

int count # Number of equivalent lines

cdef int SENTINEL

SENTINEL = -1

cdef void *safe_malloc(size_t count) except NULL:

cdef void *result

result = malloc(count)

if result == NULL:

raise MemoryError('Failed to allocate %d bytes of memory' % (count,))

return result

cdef class EquivalenceTable:

"""This tracks equivalencies between lists of hashable objects.

cdef readonly object lines

cdef readonly object _right_lines

cdef object _matching_lines

cdef int _hashtable_size

cdef int _hashtable_bitmask

cdef _hash_bucket *_hashtable

cdef _raw_line *_raw_left_lines

cdef Py_ssize_t _len_left_lines

cdef _raw_line *_raw_right_lines

cdef Py_ssize_t _len_right_lines

def __init__(self, lines):

self.lines = lines

self._len_left_lines = len(lines)

self._right_lines = None

self._len_right_lines = 0

self._hashtable_size = 0

self._hashtable = NULL

self._raw_left_lines = NULL

self._raw_right_lines = NULL

self._lines_to_raw_lines(lines, &self._raw_left_lines)

self._generate_matching_lines()

def __dealloc__(self):

if self._hashtable != NULL:

free(self._hashtable)

self._hashtable = NULL

if self._raw_left_lines != NULL:

free(self._raw_left_lines)

self._raw_left_lines = NULL

if self._raw_right_lines != NULL:

free(self._raw_right_lines)

self._raw_right_lines = NULL

cdef int _lines_to_raw_lines(self, object lines,

100

_raw_line **raw_lines) except -1:

101

"""Load a sequence of objects into the _raw_line format"""

102

cdef Py_ssize_t count, i

103

cdef PyObject *seq

104

cdef PyObject *item

105

cdef _raw_line *raw

106

107

# Do we want to use PySequence_Fast, or just assume that it is a list

108

# Also, do we need to decref the return value?

109

# http://www.python.org/doc/current/api/sequence.html

110

seq = PySequence_Fast(lines, "expected a sequence")

111

try:

112

count = PySequence_Fast_GET_SIZE(seq)

113

if count == 0:

114

return 0

115

raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line))

116

# This should set _raw_left/right_lines, which means that we should

117

# automatically clean it up during __dealloc__

118

raw_lines[0] = raw

119

120

for i from 0 <= i < count:

121

item = PySequence_Fast_GET_ITEM(seq, i)

122

# NB: We don't Py_INCREF the data pointer, because we know we

123

# maintain a pointer to the item in self.lines or

124

# self._right_lines

125

# TODO: Do we even need to track a data pointer here? It would

126

# be less memory, and we *could* just look it up in the

127

# appropriate line list.

128

raw[i].data = item

129

raw[i].hash = PyObject_Hash(item)

130

raw[i].next = SENTINEL

131

finally:

132

# TODO: Unfortunately, try/finally always generates a compiler

133

# warning about a possibly unused variable :(

134

Py_DECREF(seq)

135

return count

136

137

138

cdef Py_ssize_t _compute_hash_size(self, Py_ssize_t needed_lines):

139

"""Figure out what the optimal size of the hash table is.

140

141

The hash size will always be a power of two, and is generally the next

142

power of two larger than the input lines.

143

The only trick is that we are likely to grow the hash table from time

144

to time, so we probably want to allocate a bit extra memory, so that we

145

don't rebuild the hash table often

146

"""

147

cdef Py_ssize_t hash_size

148

149

hash_size = 1

150

while hash_size < needed_lines:

151

hash_size = hash_size << 1

152

return hash_size

153

154

cdef int _build_hash_table(self) except -1:

155

cdef Py_ssize_t hash_size

156

cdef Py_ssize_t hash_bitmask

157

cdef Py_ssize_t i

158

159

# Hash size is a power of 2

160

hash_size = self._compute_hash_size(self._len_left_lines)

161

162

if self._hashtable != NULL:

163

free(self._hashtable)

164

self._hashtable = NULL

165

self._hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) *

166

hash_size)

167

for i from 0 <= i < hash_size:

168

self._hashtable[i].line_index = SENTINEL

169

self._hashtable[i].count = 0

170

171

# Turn the hash size into a bitmask

172

self._hashtable_bitmask = hash_size - 1

173

174

# Iterate backwards, because it makes it easier to insert items into

175

# the hash (you just change the head pointer, and everything else keeps

176

# pointing to the same location).

177

for i from self._len_left_lines > i >= 0:

178

self._find_equivalence_offset(&self._raw_left_lines[i])

179

180

cdef int _find_equivalence_offset(self, _raw_line *line):

181

"""Find the node in the hash which defines this line.

182

183

Each bucket in the hash table points at exactly 1 equivalent line. If 2

184

objects would collide, we just increment to the next bucket.

185

"""

186

187

188

def _generate_matching_lines(self):

189

matches = {}

190

for idx, line in enumerate(self.lines):

Older »