~bzr-pqm/bzr/bzr.dev : revision 0.17.18

1

2

#

3

# This program is free software; you can redistribute it and/or modify

4

# it under the terms of the GNU General Public License version 2 as published

5

# by the Free Software Foundation.

6

#

7

# This program is distributed in the hope that it will be useful,

8

# but WITHOUT ANY WARRANTY; without even the implied warranty of

9

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

10

# GNU General Public License for more details.

11

#

12

# You should have received a copy of the GNU General Public License

13

# along with this program; if not, write to the Free Software

14

# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

15

#

16

17

"""Compiled extensions for doing compression."""

18

19

cdef extern from *:

20

ctypedef unsigned long size_t

21

void * malloc(size_t)

22

void * realloc(void *, size_t)

23

void free(void *)

24

25

cdef extern from "Python.h":

26

struct _PyObject:

27

pass

28

ctypedef _PyObject PyObject

29

PyObject *PySequence_Fast(object, char *) except NULL

30

Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *)

31

PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t)

32

PyObject *PyList_GET_ITEM(object, Py_ssize_t)

33

int PyList_Append(object, object) except -1

34

long PyObject_Hash(PyObject *) except -1

35

# We use PyObject_Cmp rather than PyObject_Compare because pyrex will check

36

# if there is an exception *for* us.

37

int PyObject_Cmp(PyObject *, PyObject *, int *result) except -1

38

int PyObject_Not(PyObject *) except -1

39

void Py_DECREF(PyObject *)

40

void Py_INCREF(PyObject *)

41

42

cdef enum _raw_line_flags:

43

INDEXED = 0x01

44

45

cdef struct _raw_line:

46

long hash # Cached form of the hash for this entry

47

Py_ssize_t hash_offset # The location in the hash table for this object

48

Py_ssize_t next_line_index # Next line which is equivalent to this one

49

int flags # status flags

50

PyObject *data # Raw pointer to the original line

51

52

53

cdef struct _hash_bucket:

54

Py_ssize_t line_index # First line in the left side for this bucket

55

Py_ssize_t count # Number of equivalent lines, DO we even need this?

56

57

58

cdef Py_ssize_t SENTINEL

59

SENTINEL = -1

60

61

62

cdef void *safe_malloc(size_t count) except NULL:

63

cdef void *result

64

result = malloc(count)

65

if result == NULL:

66

raise MemoryError('Failed to allocate %d bytes of memory' % (count,))

67

return result

68

69

70

cdef void *safe_realloc(void * old, size_t count) except NULL:

71

cdef void *result

72

result = realloc(old, count)

73

if result == NULL:

74

raise MemoryError('Failed to reallocate to %d bytes of memory'

75

% (count,))

76

return result

77

78

79

cdef int safe_free(void **val) except -1:

80

assert val != NULL

81

if val[0] != NULL:

82

free(val[0])

83

val[0] = NULL

84

85

86

cdef class EquivalenceTable:

87

"""This tracks equivalencies between lists of hashable objects.

88

89

:ivar lines: The 'static' lines that will be preserved between runs.

90

"""

91

92

cdef readonly object lines

93

cdef readonly object _right_lines

94

cdef Py_ssize_t _hashtable_size

95

cdef Py_ssize_t _hashtable_bitmask

96

cdef _hash_bucket *_hashtable

97

cdef _raw_line *_raw_lines

98

cdef Py_ssize_t _len_lines

99

100

def __init__(self, lines):

101

self.lines = list(lines)

102

self._right_lines = None

103

self._hashtable_size = 0

104

self._hashtable = NULL

105

self._raw_lines = NULL

106

self._len_lines = 0

107

self._lines_to_raw_lines(lines)

108

self._build_hash_table()

109

110

def __dealloc__(self):

111

safe_free(<void**>&self._hashtable)

112

safe_free(<void**>&self._raw_lines)

113

114

cdef int _line_to_raw_line(self, PyObject *line, _raw_line *raw_line) except -1:

115

"""Convert a single PyObject into a raw line."""

116

raw_line.hash = PyObject_Hash(line)

117

raw_line.next_line_index = SENTINEL

118

raw_line.hash_offset = SENTINEL

119

raw_line.flags = INDEXED

120

raw_line.data = line

121

122

cdef int _lines_to_raw_lines(self, object lines) except -1:

123

"""Load a sequence of objects into the _raw_line format"""

124

cdef Py_ssize_t count, i

125

cdef PyObject *seq

126

cdef PyObject *item

127

cdef _raw_line *raw

128

129

# Do we want to use PySequence_Fast, or just assume that it is a list

130

# Also, do we need to decref the return value?

131

# http://www.python.org/doc/current/api/sequence.html

132

seq = PySequence_Fast(lines, "expected a sequence")

133

try:

134

count = PySequence_Fast_GET_SIZE(seq)

135

if count == 0:

136

return 0

137

raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line))

138

safe_free(<void**>&self._raw_lines)

139

self._raw_lines = raw

140

self._len_lines = count

141

142

for i from 0 <= i < count:

143

item = PySequence_Fast_GET_ITEM(seq, i)

144

# NB: We don't Py_INCREF the data pointer, because we know we

145

# maintain a pointer to the item in self.lines or

146

# self._right_lines

147

# TODO: Do we even need to track a data pointer here? It would

148

# be less memory, and we *could* just look it up in the

149

# appropriate line list.

150

self._line_to_raw_line(item, &raw[i])

151

finally:

152

# TODO: Unfortunately, try/finally always generates a compiler

153

# warning about a possibly unused variable :(

154

Py_DECREF(seq)

155

return count

156

157

cdef Py_ssize_t _compute_minimum_hash_size(self, Py_ssize_t needed):

158

"""Determine the smallest hash size that can reasonably fit the data.

159

160

:param needed: The number of entries we might be inserting into

161

the hash (assuming there are no duplicate lines.)

162

:return: The number of hash table entries to use. This will always be a

163

power of two.

164

"""

165

cdef Py_ssize_t hash_size

166

cdef Py_ssize_t min_size

167

168

# TODO: Another alternative would be to actually count how full the

169

# hash-table is, and decide if we need to grow it based on

170

# density. That can take into account duplicated lines. Though if

171

# we compress well, there should be a minimal amount of

172

# duplicated lines in the output.

173

174

# At the bare minimum, we could fit all entries into a 'needed'

175

# size hash table. However, any collision would then have a long way to

176

# traverse before it could find a 'free' slot.

177

# So we set the minimum size to give us 33% empty slots.

178

min_size = <Py_ssize_t>(needed * 1.5)

179

hash_size = 1

180

while hash_size < min_size:

181

hash_size = hash_size << 1

182

return hash_size

183

184

def _py_compute_minimum_hash_size(self, needed):

185

"""Expose _compute_minimum_hash_size to python for testing."""

186

return self._compute_minimum_hash_size(needed)

187

188

cdef Py_ssize_t _compute_recommended_hash_size(self, Py_ssize_t needed):

189

"""Determine a reasonable hash size, assuming some room for growth.

190

191

:param needed: The number of entries we might be inserting into

192

the hash (assuming there are no duplicate lines.)

193

:return: The number of hash table entries to use. This will always be a

194

power of two.

195

"""

196

cdef Py_ssize_t hash_size

197

cdef Py_ssize_t min_size

198

199

# We start off with a 8k hash (after doubling), because there isn't a

200

# lot of reason to go smaller than that (this class isn't one you'll be

201

# instantiating thousands of, and you are always likely to grow here.)

202

hash_size = 4096

203

while hash_size < needed:

204

hash_size = hash_size << 1

205

# And we always give at least 2x blank space

206

hash_size = hash_size << 1

207

return hash_size

208

209

def _py_compute_recommended_hash_size(self, needed):

210

"""Expose _compute_recommended_hash_size to python for testing."""

211

return self._compute_recommended_hash_size(needed)

212

213

cdef int _build_hash_table(self) except -1:

214

"""Build the hash table 'from scratch'."""

215

cdef Py_ssize_t hash_size

216

cdef Py_ssize_t hash_bitmask

217

cdef Py_ssize_t i

218

cdef _raw_line *cur_line

219

cdef Py_ssize_t hash_offset

220

cdef _hash_bucket *cur_bucket

221

cdef _hash_bucket *new_hashtable

222

223

# Hash size is a power of 2

224

hash_size = self._compute_recommended_hash_size(self._len_lines)

225

226

new_hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) *

227

hash_size)

228

safe_free(<void**>&self._hashtable)

229

self._hashtable = new_hashtable

230

231

self._hashtable_size = hash_size

232

for i from 0 <= i < hash_size:

233

self._hashtable[i].line_index = SENTINEL

234

self._hashtable[i].count = 0

235

236

# Turn the hash size into a bitmask

237

self._hashtable_bitmask = hash_size - 1

238

239

# Iterate backwards, because it makes it easier to insert items into

240

# the hash (you just change the head pointer, and everything else keeps

241

# pointing to the same location).

242

for i from self._len_lines > i >= 0:

243

cur_line = self._raw_lines + i

244

if not (cur_line.flags & INDEXED == INDEXED):

245

continue

246

hash_offset = self._find_hash_position(cur_line)

247

248

# Point this line to the location in the hash table

249

cur_line.hash_offset = hash_offset

250

# And make this line the head of the hash table

251

cur_bucket = self._hashtable + hash_offset

252

cur_line.next_line_index = cur_bucket.line_index

253

cur_bucket.line_index = i

254

cur_bucket.count = cur_bucket.count + 1

255

256

cdef int _extend_raw_lines(self, object index) except -1:

257

"""Add the last N lines from self.lines into the raw_lines array."""

258

cdef Py_ssize_t new_count

259

cdef Py_ssize_t new_total_len

260

cdef Py_ssize_t old_len

261

cdef PyObject *item

262

cdef PyObject *should_index

263

cdef Py_ssize_t i

264

cdef Py_ssize_t line_index

265

cdef _raw_line *cur_line

266

cdef PyObject *seq_index

267

268

seq_index = PySequence_Fast(index, "expected a sequence for index")

269

try:

270

old_len = self._len_lines

271

new_count = PySequence_Fast_GET_SIZE(seq_index)

272

new_total_len = new_count + self._len_lines

273

self._raw_lines = <_raw_line*>safe_realloc(<void*>self._raw_lines,

274

new_total_len * sizeof(_raw_line))

275

self._len_lines = new_total_len

276

# Now that we have enough space, start adding the new lines

277

# into the array. These are done in forward order.

278

for i from 0 <= i < new_count:

279

line_index = i + old_len

280

cur_line = self._raw_lines + line_index

281

item = PyList_GET_ITEM(self.lines, line_index)

282

self._line_to_raw_line(item, cur_line)

283

should_index = PySequence_Fast_GET_ITEM(seq_index, i)

284

if PyObject_Not(should_index):

285

cur_line.flags = cur_line.flags & ~(<int>INDEXED)

286

finally:

287

Py_DECREF(seq_index)

288

289

cdef int _extend_hash_table(self, Py_ssize_t count) except -1:

290

"""Add the last N entries in self.lines to the hash table.

291

292

:param index: A sequence that declares whether each node should be

293

INDEXED or not.

294

"""

295

cdef Py_ssize_t line_index

296

cdef Py_ssize_t start

297

cdef _raw_line *cur_line

298

cdef _raw_line *next_line

299

cdef Py_ssize_t hash_offset

300

cdef _hash_bucket *cur_bucket

301

302

start = self._len_lines - count

303

304

for line_index from start <= line_index < self._len_lines:

305

cur_line = self._raw_lines + line_index

306

if cur_line.flags & INDEXED != INDEXED:

307

continue

308

hash_offset = self._find_hash_position(cur_line)

309

310

# Point this line to the location in the hash table

311

cur_line.hash_offset = hash_offset

312

313

# Make this line the tail of the hash table

314

cur_bucket = self._hashtable + hash_offset

315

cur_bucket.count = cur_bucket.count + 1

316

if cur_bucket.line_index == SENTINEL:

317

cur_bucket.line_index = line_index

318

continue

319

# We need to track through the pointers and insert this at

320

# the end

321

next_line = self._raw_lines + cur_bucket.line_index

322

while next_line.next_line_index != SENTINEL:

323

next_line = self._raw_lines + next_line.next_line_index

324

next_line.next_line_index = line_index

325

326

cdef Py_ssize_t _find_hash_position(self, _raw_line *line) except -1:

327

"""Find the node in the hash which defines this line.

328

329

Each bucket in the hash table points at exactly 1 equivalent line. If 2

330

objects would collide, we just increment to the next bucket until we

331

get to an empty bucket that is either empty or exactly matches this

332

object.

333

334

:return: The offset in the hash table for this entry

335

"""

336

cdef Py_ssize_t location

337

cdef _raw_line *ref_line

338

cdef Py_ssize_t ref_index

339

cdef int compare_result

340

341

location = line.hash & self._hashtable_bitmask

342

ref_index = self._hashtable[location].line_index

343

while ref_index != SENTINEL:

344

ref_line = self._raw_lines + ref_index

345

if (ref_line.hash == line.hash):

346

PyObject_Cmp(ref_line.data, line.data, &compare_result)

347

if compare_result == 0:

348

break

349

location = (location + 1) & self._hashtable_bitmask

350

ref_index = self._hashtable[location].line_index

351

return location

352

353

def _py_find_hash_position(self, line):

354

"""Used only for testing.

355

356

Return the location where this fits in the hash table

357

"""

358

cdef _raw_line raw_line

359

360

self._line_to_raw_line(<PyObject *>line, &raw_line)

361

return self._find_hash_position(&raw_line)

362

363

def _inspect_left_lines(self):

364

"""Used only for testing.

365

366

:return: None if _raw_lines is NULL,

367

else [(object, hash_val, hash_loc, next_val)] for each node in raw

368

lines.

369

"""

370

cdef Py_ssize_t i

371

372

if self._raw_lines == NULL:

373

return None

374

375

result = []

376

for i from 0 <= i < self._len_lines:

377

PyList_Append(result,

378

(<object>self._raw_lines[i].data,

379

self._raw_lines[i].hash,

380

self._raw_lines[i].hash_offset,

381

self._raw_lines[i].next_line_index,

382

))

383

return result

384

385

def _inspect_hash_table(self):

386

"""Used only for testing.

387

388

This iterates the hash table, and returns 'interesting' entries.

389

:return: (total_size, [(offset, line_index, count)] for entries that

390

are not empty.

391

"""

392

cdef int i

393

394

interesting = []

395

for i from 0 <= i < self._hashtable_size:

396

if self._hashtable[i].line_index != SENTINEL:

397

PyList_Append(interesting,

398

(i, self._hashtable[i].line_index,

399

self._hashtable[i].count))

400

return (self._hashtable_size, interesting)

401

402

def get_matches(self, line):

403

"""Return the lines which match the line in right."""

404

cdef Py_ssize_t count

405

cdef Py_ssize_t i

406

cdef Py_ssize_t *matches

407

408

matches = NULL

409

410

try:

411

count = self.get_raw_matches(<PyObject *>line, &matches)

412

if count == 0:

413

return None

414

result = []

415

for i from 0 <= i < count:

416

PyList_Append(result, matches[i])

417

finally:

418

safe_free(<void**>&matches)

419

return result

420

421

cdef Py_ssize_t get_raw_matches(self, PyObject *line,

422

Py_ssize_t **matches) except -1:

423

"""Get all the possible entries that match the given location.

424

425

:param line: A Python line which we want to match

426

:param pos: The index position in _right_lines

427

:param matches: A variable to hold an array containing all the matches.

428

This will be allocated using malloc, and the caller is responsible

429

to free it with free(). If there are no matches, no memory will be

430

allocated.

431

:retrun: The number of matched positions

432

"""

433

cdef Py_ssize_t hash_offset

434

cdef _raw_line raw_line

435

cdef _hash_bucket cur_bucket

436

cdef Py_ssize_t cur_line_idx

437

cdef Py_ssize_t count

438

cdef Py_ssize_t *local_matches

439

440

self._line_to_raw_line(line, &raw_line)

441

hash_offset = self._find_hash_position(&raw_line)

442

cur_bucket = self._hashtable[hash_offset]

443

cur_line_idx = cur_bucket.line_index

444

if cur_line_idx == SENTINEL:

445

return 0

446

local_matches = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) *

447

(cur_bucket.count + 1))

448

matches[0] = local_matches

449

for count from 0 <= count < cur_bucket.count:

450

assert cur_line_idx != SENTINEL

451

local_matches[count] = cur_line_idx

452

cur_line_idx = self._raw_lines[cur_line_idx].next_line_index

453

return cur_bucket.count

454

455

def _get_matching_lines(self):

456

"""Return a dictionary showing matching lines."""

457

matching = {}

458

for line in self.lines:

459

matching[line] = self.get_matches(line)

460

return matching

461

462

def get_idx_matches(self, right_idx):

463

"""Return the left lines matching the right line at the given offset."""

464

line = self._right_lines[right_idx]

465

return self.get_matches(line)

466

467

def extend_lines(self, lines, index):

468

"""Add more lines to the left-lines list.

469

470

:param lines: A list of lines to add

471

:param index: A True/False for each node to define if it should be

472

indexed.

473

"""

474

cdef Py_ssize_t orig_len

475

cdef Py_ssize_t min_new_hash_size

476

assert len(lines) == len(index)

477

min_new_hash_size = self._compute_minimum_hash_size(len(self.lines) +

478

len(lines))

479

orig_len = len(self.lines)

480

self.lines.extend(lines)

481

self._extend_raw_lines(index)

482

if self._hashtable_size >= min_new_hash_size:

483

# Just add the new lines, don't bother resizing the hash table

484

self._extend_hash_table(len(index))

485

return

486

self._build_hash_table()

487

488

def set_right_lines(self, lines):

489

"""Set the lines we will be matching against."""

490

self._right_lines = lines

491

492

493

cdef Py_ssize_t intersect_sorted(Py_ssize_t *base, Py_ssize_t base_count,

494

Py_ssize_t *new_values, Py_ssize_t new_count) except -1:

495

"""Intersect the base values with the new values.

496

497

It is assumed that both base and new_values are sorted in ascending order,

498

with no duplicates.

499

500

The values in 'base' will be modified to only contain the entries that are

501

in both base and new. If nothing matches, 'base' will not be modified

502

503

:param base: An array of base values

504

:param base_count: The length of the base array

505

:param new_values: An array of values to compare to base, will be modified

506

to only contain matching entries

507

:param new_count: The number of new entries

508

:return: The count of matching values

509

"""

510

cdef Py_ssize_t *cur_base

511

cdef Py_ssize_t *cur_new

512

cdef Py_ssize_t *cur_out

513

cdef Py_ssize_t count

514

cdef Py_ssize_t *base_end

515

cdef Py_ssize_t *new_end

516

517

cur_base = base

518

cur_out = base

519

cur_new = new_values

520

base_end = base + base_count

521

new_end = new_values + new_count

522

count = 0

523

524

while cur_base < base_end and cur_new < new_end:

525

if cur_base[0] == cur_new[0]:

526

# This may be assigning to self.

527

cur_out[0] = cur_base[0]

528

count = count + 1

529

cur_out = cur_out + 1

530

# We matched, so move both pointers

531

cur_base = cur_base + 1

532

cur_new = cur_new + 1

533

elif cur_base[0] < cur_new[0]:

534

# cur_base is "behind" so increment it

535

cur_base = cur_base + 1

536

else:

537

# cur_new must be "behind" move its pointer

538

cur_new = cur_new + 1

539

return count

540

541

542

cdef object array_as_list(Py_ssize_t *array, Py_ssize_t count):

543

cdef int i

544

lst = []

545

for i from 0 <= i < count:

546

lst.append(array[i])

547

return lst

548

549

550

def _get_longest_match(equivalence_table, pos, max_pos, locations):

551

"""Get the longest possible match for the current position."""

552

cdef EquivalenceTable eq

553

cdef Py_ssize_t cpos

554

cdef Py_ssize_t cmax_pos

555

cdef Py_ssize_t *matches

556

cdef Py_ssize_t match_count

557

cdef PyObject *line

558

cdef Py_ssize_t *copy_ends

559

cdef Py_ssize_t copy_count

560

cdef Py_ssize_t range_len

561

cdef Py_ssize_t in_start

562

cdef Py_ssize_t i

563

cdef Py_ssize_t intersect_count

564

565

eq = equivalence_table

566

cpos = pos

567

cmax_pos = max_pos

568

range_start = pos

569

range_len = 0

570

matches = NULL

571

match_count = 0

572

copy_ends = NULL

573

copy_count = 0

574

575

# TODO: Handle when locations is not None

576

while cpos < cmax_pos:

577

# TODO: Instead of grabbing the full list of matches and then doing an

578

# intersection, use a helper that does the intersection

579

# as-it-goes.

580

line = PyList_GET_ITEM(eq._right_lines, cpos)

581

safe_free(<void**>&matches)

582

match_count = eq.get_raw_matches(line, &matches)

583

if match_count == 0:

584

# No more matches, just return whatever we have, but we know that

585

# this last position is not going to match anything

586

cpos = cpos + 1

587

break

588

else:

589

if copy_ends == NULL:

590

# We are starting a new range, just steal the matches

591

copy_ends = matches

592

copy_count = match_count

593

matches = NULL

594

match_count = 0

595

for i from 0 <= i < copy_count:

596

copy_ends[i] = copy_ends[i] + 1

597

range_len = 1

598

else:

599

# We are currently in the middle of a match

600

intersect_count = intersect_sorted(copy_ends, copy_count,

601

matches, match_count)

602

if intersect_count == 0:

603

# But we are done with this match, we should be

604

# starting a new one, though. We will pass back 'locations'

605

# so that we don't have to do another lookup.

606

break

607

else:

608

copy_count = intersect_count

609

# range continues, step the copy ends

610

for i from 0 <= i < copy_count:

611

copy_ends[i] = copy_ends[i] + 1

612

range_len = range_len + 1

613

safe_free(<void **>&matches)

614

match_count = 0

615

cpos = cpos + 1

616

if matches == NULL:

617

# We consumed all of our matches

618

locations = None

619

else:

620

locations = array_as_list(matches, match_count)

621

safe_free(<void **>&matches)

622

if copy_ends == NULL or copy_count == 0:

623

# We never had a match

624

return None, cpos, locations

625

# Because copy_ends is always sorted, we know the 'min' is just the first

626

# node

627

in_start = copy_ends[0]

628

safe_free(<void**>&copy_ends)

629

return ((in_start - range_len), range_start, range_len), cpos, locations