17
17
"""Compiled extensions for doing compression."""
20
ctypedef unsigned long size_t
24
cdef extern from "Python.h":
27
ctypedef _PyObject PyObject
28
PyObject *PySequence_Fast(object, char *) except NULL
29
Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *)
30
PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t)
31
long PyObject_Hash(PyObject *) except -1
32
void Py_DECREF(PyObject *)
33
void Py_INCREF(PyObject *)
36
cdef struct _raw_line:
37
long hash # Cached form of the hash for this entry
38
int next # Next line which is equivalent to this one
39
PyObject *data # Raw pointer to the original line
42
cdef struct _hash_bucket:
43
int line_index # First line in the left side for this bucket
44
int count # Number of equivalent lines
51
cdef void *safe_malloc(size_t count) except NULL:
53
result = malloc(count)
55
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
20
59
cdef class EquivalenceTable:
21
60
"""This tracks equivalencies between lists of hashable objects.
26
65
cdef readonly object lines
27
66
cdef readonly object _right_lines
28
67
cdef object _matching_lines
68
cdef int _hashtable_size
69
cdef int _hashtable_bitmask
70
cdef _hash_bucket *_hashtable
71
cdef _raw_line *_raw_left_lines
72
cdef Py_ssize_t _len_left_lines
73
cdef _raw_line *_raw_right_lines
74
cdef Py_ssize_t _len_right_lines
30
76
def __init__(self, lines):
78
self._len_left_lines = len(lines)
32
79
self._right_lines = None
80
self._len_right_lines = 0
81
self._hashtable_size = 0
82
self._hashtable = NULL
83
self._raw_left_lines = NULL
84
self._raw_right_lines = NULL
85
self._lines_to_raw_lines(lines, &self._raw_left_lines)
33
86
self._generate_matching_lines()
88
def __dealloc__(self):
89
if self._hashtable != NULL:
91
self._hashtable = NULL
92
if self._raw_left_lines != NULL:
93
free(self._raw_left_lines)
94
self._raw_left_lines = NULL
95
if self._raw_right_lines != NULL:
96
free(self._raw_right_lines)
97
self._raw_right_lines = NULL
99
cdef int _lines_to_raw_lines(self, object lines,
100
_raw_line **raw_lines) except -1:
101
"""Load a sequence of objects into the _raw_line format"""
102
cdef Py_ssize_t count, i
107
# Do we want to use PySequence_Fast, or just assume that it is a list
108
# Also, do we need to decref the return value?
109
# http://www.python.org/doc/current/api/sequence.html
110
seq = PySequence_Fast(lines, "expected a sequence")
112
count = PySequence_Fast_GET_SIZE(seq)
115
raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line))
116
# This should set _raw_left/right_lines, which means that we should
117
# automatically clean it up during __dealloc__
120
for i from 0 <= i < count:
121
item = PySequence_Fast_GET_ITEM(seq, i)
122
# NB: We don't Py_INCREF the data pointer, because we know we
123
# maintain a pointer to the item in self.lines or
125
# TODO: Do we even need to track a data pointer here? It would
126
# be less memory, and we *could* just look it up in the
127
# appropriate line list.
129
raw[i].hash = PyObject_Hash(item)
130
raw[i].next = SENTINEL
132
# TODO: Unfortunately, try/finally always generates a compiler
133
# warning about a possibly unused variable :(
138
cdef Py_ssize_t _compute_hash_size(self, Py_ssize_t needed_lines):
139
"""Figure out what the optimal size of the hash table is.
141
The hash size will always be a power of two, and is generally the next
142
power of two larger than the input lines.
143
The only trick is that we are likely to grow the hash table from time
144
to time, so we probably want to allocate a bit extra memory, so that we
145
don't rebuild the hash table often
147
cdef Py_ssize_t hash_size
150
while hash_size < needed_lines:
151
hash_size = hash_size << 1
154
cdef int _build_hash_table(self) except -1:
155
cdef Py_ssize_t hash_size
156
cdef Py_ssize_t hash_bitmask
159
# Hash size is a power of 2
160
hash_size = self._compute_hash_size(self._len_left_lines)
162
if self._hashtable != NULL:
163
free(self._hashtable)
164
self._hashtable = NULL
165
self._hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) *
167
for i from 0 <= i < hash_size:
168
self._hashtable[i].line_index = SENTINEL
169
self._hashtable[i].count = 0
171
# Turn the hash size into a bitmask
172
self._hashtable_bitmask = hash_size - 1
174
# Iterate backwards, because it makes it easier to insert items into
175
# the hash (you just change the head pointer, and everything else keeps
176
# pointing to the same location).
177
for i from self._len_left_lines > i >= 0:
178
self._find_equivalence_offset(&self._raw_left_lines[i])
180
cdef int _find_equivalence_offset(self, _raw_line *line):
181
"""Find the node in the hash which defines this line.
183
Each bucket in the hash table points at exactly 1 equivalent line. If 2
184
objects would collide, we just increment to the next bucket.
35
188
def _generate_matching_lines(self):
37
190
for idx, line in enumerate(self.lines):