1
# Copyright (C) 2008 Canonical Limited.
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as published
5
# by the Free Software Foundation.
7
# This program is distributed in the hope that it will be useful,
8
# but WITHOUT ANY WARRANTY; without even the implied warranty of
9
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
# GNU General Public License for more details.
12
# You should have received a copy of the GNU General Public License
13
# along with this program; if not, write to the Free Software
14
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Compiled extensions for doing compression."""
20
ctypedef unsigned long size_t
22
void * realloc(void *, size_t)
25
cdef extern from "Python.h":
28
ctypedef _PyObject PyObject
29
PyObject *PySequence_Fast(object, char *) except NULL
30
Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *)
31
PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t)
32
PyObject *PyList_GET_ITEM(object, Py_ssize_t)
33
int PyList_Append(object, object) except -1
34
long PyObject_Hash(PyObject *) except -1
35
# We use PyObject_Cmp rather than PyObject_Compare because pyrex will check
36
# if there is an exception *for* us.
37
int PyObject_Cmp(PyObject *, PyObject *, int *result) except -1
38
int PyObject_Not(PyObject *) except -1
39
void Py_DECREF(PyObject *)
40
void Py_INCREF(PyObject *)
42
cdef enum _raw_line_flags:
45
cdef struct _raw_line:
46
long hash # Cached form of the hash for this entry
47
Py_ssize_t hash_offset # The location in the hash table for this object
48
Py_ssize_t next_line_index # Next line which is equivalent to this one
49
int flags # status flags
50
PyObject *data # Raw pointer to the original line
53
cdef struct _hash_bucket:
54
Py_ssize_t line_index # First line in the left side for this bucket
55
Py_ssize_t count # Number of equivalent lines, DO we even need this?
58
cdef Py_ssize_t SENTINEL
62
cdef void *safe_malloc(size_t count) except NULL:
64
result = malloc(count)
66
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
70
cdef void *safe_realloc(void * old, size_t count) except NULL:
72
result = realloc(old, count)
74
raise MemoryError('Failed to reallocate to %d bytes of memory'
79
cdef int safe_free(void **val) except -1:
86
cdef class EquivalenceTable:
87
"""This tracks equivalencies between lists of hashable objects.
89
:ivar lines: The 'static' lines that will be preserved between runs.
92
cdef readonly object lines
93
cdef readonly object _right_lines
94
cdef Py_ssize_t _hashtable_size
95
cdef Py_ssize_t _hashtable_bitmask
96
cdef _hash_bucket *_hashtable
97
cdef _raw_line *_raw_lines
98
cdef Py_ssize_t _len_lines
100
def __init__(self, lines):
101
self.lines = list(lines)
102
self._right_lines = None
103
self._hashtable_size = 0
104
self._hashtable = NULL
105
self._raw_lines = NULL
107
self._lines_to_raw_lines(lines)
108
self._build_hash_table()
110
def __dealloc__(self):
111
safe_free(<void**>&self._hashtable)
112
safe_free(<void**>&self._raw_lines)
114
cdef int _line_to_raw_line(self, PyObject *line, _raw_line *raw_line) except -1:
115
"""Convert a single PyObject into a raw line."""
116
raw_line.hash = PyObject_Hash(line)
117
raw_line.next_line_index = SENTINEL
118
raw_line.hash_offset = SENTINEL
119
raw_line.flags = INDEXED
122
cdef int _lines_to_raw_lines(self, object lines) except -1:
123
"""Load a sequence of objects into the _raw_line format"""
124
cdef Py_ssize_t count, i
129
# Do we want to use PySequence_Fast, or just assume that it is a list
130
# Also, do we need to decref the return value?
131
# http://www.python.org/doc/current/api/sequence.html
132
seq = PySequence_Fast(lines, "expected a sequence")
134
count = PySequence_Fast_GET_SIZE(seq)
137
raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line))
138
safe_free(<void**>&self._raw_lines)
139
self._raw_lines = raw
140
self._len_lines = count
142
for i from 0 <= i < count:
143
item = PySequence_Fast_GET_ITEM(seq, i)
144
# NB: We don't Py_INCREF the data pointer, because we know we
145
# maintain a pointer to the item in self.lines or
147
# TODO: Do we even need to track a data pointer here? It would
148
# be less memory, and we *could* just look it up in the
149
# appropriate line list.
150
self._line_to_raw_line(item, &raw[i])
152
# TODO: Unfortunately, try/finally always generates a compiler
153
# warning about a possibly unused variable :(
157
cdef Py_ssize_t _compute_minimum_hash_size(self, Py_ssize_t needed):
158
"""Determine the smallest hash size that can reasonably fit the data.
160
:param needed: The number of entries we might be inserting into
161
the hash (assuming there are no duplicate lines.)
162
:return: The number of hash table entries to use. This will always be a
165
cdef Py_ssize_t hash_size
166
cdef Py_ssize_t min_size
168
# TODO: Another alternative would be to actually count how full the
169
# hash-table is, and decide if we need to grow it based on
170
# density. That can take into account duplicated lines. Though if
171
# we compress well, there should be a minimal amount of
172
# duplicated lines in the output.
174
# At the bare minimum, we could fit all entries into a 'needed'
175
# size hash table. However, any collision would then have a long way to
176
# traverse before it could find a 'free' slot.
177
# So we set the minimum size to give us 33% empty slots.
178
min_size = <Py_ssize_t>(needed * 1.5)
180
while hash_size < min_size:
181
hash_size = hash_size << 1
184
def _py_compute_minimum_hash_size(self, needed):
185
"""Expose _compute_minimum_hash_size to python for testing."""
186
return self._compute_minimum_hash_size(needed)
188
cdef Py_ssize_t _compute_recommended_hash_size(self, Py_ssize_t needed):
189
"""Determine a reasonable hash size, assuming some room for growth.
191
:param needed: The number of entries we might be inserting into
192
the hash (assuming there are no duplicate lines.)
193
:return: The number of hash table entries to use. This will always be a
196
cdef Py_ssize_t hash_size
197
cdef Py_ssize_t min_size
199
# We start off with a 8k hash (after doubling), because there isn't a
200
# lot of reason to go smaller than that (this class isn't one you'll be
201
# instantiating thousands of, and you are always likely to grow here.)
203
while hash_size < needed:
204
hash_size = hash_size << 1
205
# And we always give at least 2x blank space
206
hash_size = hash_size << 1
209
def _py_compute_recommended_hash_size(self, needed):
210
"""Expose _compute_recommended_hash_size to python for testing."""
211
return self._compute_recommended_hash_size(needed)
213
cdef int _build_hash_table(self) except -1:
214
"""Build the hash table 'from scratch'."""
215
cdef Py_ssize_t hash_size
216
cdef Py_ssize_t hash_bitmask
218
cdef _raw_line *cur_line
219
cdef Py_ssize_t hash_offset
220
cdef _hash_bucket *cur_bucket
221
cdef _hash_bucket *new_hashtable
223
# Hash size is a power of 2
224
hash_size = self._compute_recommended_hash_size(self._len_lines)
226
new_hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) *
228
safe_free(<void**>&self._hashtable)
229
self._hashtable = new_hashtable
231
self._hashtable_size = hash_size
232
for i from 0 <= i < hash_size:
233
self._hashtable[i].line_index = SENTINEL
234
self._hashtable[i].count = 0
236
# Turn the hash size into a bitmask
237
self._hashtable_bitmask = hash_size - 1
239
# Iterate backwards, because it makes it easier to insert items into
240
# the hash (you just change the head pointer, and everything else keeps
241
# pointing to the same location).
242
for i from self._len_lines > i >= 0:
243
cur_line = self._raw_lines + i
244
if not (cur_line.flags & INDEXED == INDEXED):
246
hash_offset = self._find_hash_position(cur_line)
248
# Point this line to the location in the hash table
249
cur_line.hash_offset = hash_offset
250
# And make this line the head of the hash table
251
cur_bucket = self._hashtable + hash_offset
252
cur_line.next_line_index = cur_bucket.line_index
253
cur_bucket.line_index = i
254
cur_bucket.count = cur_bucket.count + 1
256
cdef int _extend_raw_lines(self, object index) except -1:
257
"""Add the last N lines from self.lines into the raw_lines array."""
258
cdef Py_ssize_t new_count
259
cdef Py_ssize_t new_total_len
260
cdef Py_ssize_t old_len
262
cdef PyObject *should_index
264
cdef Py_ssize_t line_index
265
cdef _raw_line *cur_line
266
cdef PyObject *seq_index
268
seq_index = PySequence_Fast(index, "expected a sequence for index")
270
old_len = self._len_lines
271
new_count = PySequence_Fast_GET_SIZE(seq_index)
272
new_total_len = new_count + self._len_lines
273
self._raw_lines = <_raw_line*>safe_realloc(<void*>self._raw_lines,
274
new_total_len * sizeof(_raw_line))
275
self._len_lines = new_total_len
276
# Now that we have enough space, start adding the new lines
277
# into the array. These are done in forward order.
278
for i from 0 <= i < new_count:
279
line_index = i + old_len
280
cur_line = self._raw_lines + line_index
281
item = PyList_GET_ITEM(self.lines, line_index)
282
self._line_to_raw_line(item, cur_line)
283
should_index = PySequence_Fast_GET_ITEM(seq_index, i)
284
if PyObject_Not(should_index):
285
cur_line.flags = cur_line.flags & ~(<int>INDEXED)
289
cdef int _extend_hash_table(self, Py_ssize_t count) except -1:
290
"""Add the last N entries in self.lines to the hash table.
292
:param index: A sequence that declares whether each node should be
295
cdef Py_ssize_t line_index
296
cdef Py_ssize_t start
297
cdef _raw_line *cur_line
298
cdef _raw_line *next_line
299
cdef Py_ssize_t hash_offset
300
cdef _hash_bucket *cur_bucket
302
start = self._len_lines - count
304
for line_index from start <= line_index < self._len_lines:
305
cur_line = self._raw_lines + line_index
306
if cur_line.flags & INDEXED != INDEXED:
308
hash_offset = self._find_hash_position(cur_line)
310
# Point this line to the location in the hash table
311
cur_line.hash_offset = hash_offset
313
# Make this line the tail of the hash table
314
cur_bucket = self._hashtable + hash_offset
315
cur_bucket.count = cur_bucket.count + 1
316
if cur_bucket.line_index == SENTINEL:
317
cur_bucket.line_index = line_index
319
# We need to track through the pointers and insert this at
321
next_line = self._raw_lines + cur_bucket.line_index
322
while next_line.next_line_index != SENTINEL:
323
next_line = self._raw_lines + next_line.next_line_index
324
next_line.next_line_index = line_index
326
cdef Py_ssize_t _find_hash_position(self, _raw_line *line) except -1:
327
"""Find the node in the hash which defines this line.
329
Each bucket in the hash table points at exactly 1 equivalent line. If 2
330
objects would collide, we just increment to the next bucket until we
331
get to an empty bucket that is either empty or exactly matches this
334
:return: The offset in the hash table for this entry
336
cdef Py_ssize_t location
337
cdef _raw_line *ref_line
338
cdef Py_ssize_t ref_index
339
cdef int compare_result
341
location = line.hash & self._hashtable_bitmask
342
ref_index = self._hashtable[location].line_index
343
while ref_index != SENTINEL:
344
ref_line = self._raw_lines + ref_index
345
if (ref_line.hash == line.hash):
346
PyObject_Cmp(ref_line.data, line.data, &compare_result)
347
if compare_result == 0:
349
location = (location + 1) & self._hashtable_bitmask
350
ref_index = self._hashtable[location].line_index
353
def _py_find_hash_position(self, line):
354
"""Used only for testing.
356
Return the location where this fits in the hash table
358
cdef _raw_line raw_line
360
self._line_to_raw_line(<PyObject *>line, &raw_line)
361
return self._find_hash_position(&raw_line)
363
def _inspect_left_lines(self):
364
"""Used only for testing.
366
:return: None if _raw_lines is NULL,
367
else [(object, hash_val, hash_loc, next_val)] for each node in raw
372
if self._raw_lines == NULL:
376
for i from 0 <= i < self._len_lines:
377
PyList_Append(result,
378
(<object>self._raw_lines[i].data,
379
self._raw_lines[i].hash,
380
self._raw_lines[i].hash_offset,
381
self._raw_lines[i].next_line_index,
385
def _inspect_hash_table(self):
386
"""Used only for testing.
388
This iterates the hash table, and returns 'interesting' entries.
389
:return: (total_size, [(offset, line_index, count)] for entries that
395
for i from 0 <= i < self._hashtable_size:
396
if self._hashtable[i].line_index != SENTINEL:
397
PyList_Append(interesting,
398
(i, self._hashtable[i].line_index,
399
self._hashtable[i].count))
400
return (self._hashtable_size, interesting)
402
def get_matches(self, line):
403
"""Return the lines which match the line in right."""
404
cdef Py_ssize_t count
406
cdef Py_ssize_t *matches
411
count = self.get_raw_matches(<PyObject *>line, &matches)
415
for i from 0 <= i < count:
416
PyList_Append(result, matches[i])
418
safe_free(<void**>&matches)
421
cdef Py_ssize_t get_raw_matches(self, PyObject *line,
422
Py_ssize_t **matches) except -1:
423
"""Get all the possible entries that match the given location.
425
:param line: A Python line which we want to match
426
:param pos: The index position in _right_lines
427
:param matches: A variable to hold an array containing all the matches.
428
This will be allocated using malloc, and the caller is responsible
429
to free it with free(). If there are no matches, no memory will be
431
:retrun: The number of matched positions
433
cdef Py_ssize_t hash_offset
434
cdef _raw_line raw_line
435
cdef _hash_bucket cur_bucket
436
cdef Py_ssize_t cur_line_idx
437
cdef Py_ssize_t count
438
cdef Py_ssize_t *local_matches
440
self._line_to_raw_line(line, &raw_line)
441
hash_offset = self._find_hash_position(&raw_line)
442
cur_bucket = self._hashtable[hash_offset]
443
cur_line_idx = cur_bucket.line_index
444
if cur_line_idx == SENTINEL:
446
local_matches = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) *
447
(cur_bucket.count + 1))
448
matches[0] = local_matches
449
for count from 0 <= count < cur_bucket.count:
450
assert cur_line_idx != SENTINEL
451
local_matches[count] = cur_line_idx
452
cur_line_idx = self._raw_lines[cur_line_idx].next_line_index
453
return cur_bucket.count
455
def _get_matching_lines(self):
456
"""Return a dictionary showing matching lines."""
458
for line in self.lines:
459
matching[line] = self.get_matches(line)
462
def get_idx_matches(self, right_idx):
463
"""Return the left lines matching the right line at the given offset."""
464
line = self._right_lines[right_idx]
465
return self.get_matches(line)
467
def extend_lines(self, lines, index):
468
"""Add more lines to the left-lines list.
470
:param lines: A list of lines to add
471
:param index: A True/False for each node to define if it should be
474
cdef Py_ssize_t orig_len
475
cdef Py_ssize_t min_new_hash_size
476
assert len(lines) == len(index)
477
min_new_hash_size = self._compute_minimum_hash_size(len(self.lines) +
479
orig_len = len(self.lines)
480
self.lines.extend(lines)
481
self._extend_raw_lines(index)
482
if self._hashtable_size >= min_new_hash_size:
483
# Just add the new lines, don't bother resizing the hash table
484
self._extend_hash_table(len(index))
486
self._build_hash_table()
488
def set_right_lines(self, lines):
489
"""Set the lines we will be matching against."""
490
self._right_lines = lines
493
cdef Py_ssize_t intersect_sorted(Py_ssize_t *base, Py_ssize_t base_count,
494
Py_ssize_t *new_values, Py_ssize_t new_count) except -1:
495
"""Intersect the base values with the new values.
497
It is assumed that both base and new_values are sorted in ascending order,
500
The values in 'base' will be modified to only contain the entries that are
501
in both base and new. If nothing matches, 'base' will not be modified
503
:param base: An array of base values
504
:param base_count: The length of the base array
505
:param new_values: An array of values to compare to base, will be modified
506
to only contain matching entries
507
:param new_count: The number of new entries
508
:return: The count of matching values
510
cdef Py_ssize_t *cur_base
511
cdef Py_ssize_t *cur_new
512
cdef Py_ssize_t *cur_out
513
cdef Py_ssize_t count
514
cdef Py_ssize_t *base_end
515
cdef Py_ssize_t *new_end
520
base_end = base + base_count
521
new_end = new_values + new_count
524
while cur_base < base_end and cur_new < new_end:
525
if cur_base[0] == cur_new[0]:
526
# This may be assigning to self.
527
cur_out[0] = cur_base[0]
529
cur_out = cur_out + 1
530
# We matched, so move both pointers
531
cur_base = cur_base + 1
532
cur_new = cur_new + 1
533
elif cur_base[0] < cur_new[0]:
534
# cur_base is "behind" so increment it
535
cur_base = cur_base + 1
537
# cur_new must be "behind" move its pointer
538
cur_new = cur_new + 1
542
cdef object array_as_list(Py_ssize_t *array, Py_ssize_t count):
545
for i from 0 <= i < count:
550
def _get_longest_match(equivalence_table, pos, max_pos, locations):
551
"""Get the longest possible match for the current position."""
552
cdef EquivalenceTable eq
554
cdef Py_ssize_t cmax_pos
555
cdef Py_ssize_t *matches
556
cdef Py_ssize_t match_count
558
cdef Py_ssize_t *copy_ends
559
cdef Py_ssize_t copy_count
560
cdef Py_ssize_t range_len
561
cdef Py_ssize_t in_start
563
cdef Py_ssize_t intersect_count
565
eq = equivalence_table
575
# TODO: Handle when locations is not None
576
while cpos < cmax_pos:
577
# TODO: Instead of grabbing the full list of matches and then doing an
578
# intersection, use a helper that does the intersection
580
line = PyList_GET_ITEM(eq._right_lines, cpos)
581
safe_free(<void**>&matches)
582
match_count = eq.get_raw_matches(line, &matches)
584
# No more matches, just return whatever we have, but we know that
585
# this last position is not going to match anything
589
if copy_ends == NULL:
590
# We are starting a new range, just steal the matches
592
copy_count = match_count
595
for i from 0 <= i < copy_count:
596
copy_ends[i] = copy_ends[i] + 1
599
# We are currently in the middle of a match
600
intersect_count = intersect_sorted(copy_ends, copy_count,
601
matches, match_count)
602
if intersect_count == 0:
603
# But we are done with this match, we should be
604
# starting a new one, though. We will pass back 'locations'
605
# so that we don't have to do another lookup.
608
copy_count = intersect_count
609
# range continues, step the copy ends
610
for i from 0 <= i < copy_count:
611
copy_ends[i] = copy_ends[i] + 1
612
range_len = range_len + 1
613
safe_free(<void **>&matches)
617
# We consumed all of our matches
620
locations = array_as_list(matches, match_count)
621
safe_free(<void **>&matches)
622
if copy_ends == NULL or copy_count == 0:
623
# We never had a match
624
return None, cpos, locations
625
# Because copy_ends is always sorted, we know the 'min' is just the first
627
in_start = copy_ends[0]
628
safe_free(<void**>©_ends)
629
return ((in_start - range_len), range_start, range_len), cpos, locations