~bzr-pqm/bzr/bzr.dev

Viewing changes to _groupcompress_pyx.pyx

Committer: John Arbash Meinel
Date: 2009-03-04 16:01:55 UTC
mfrom: (0.23.59 rabin)
mto: (0.17.34 trunk) (3735.31.1 groupcompress)
mto: This revision was merged to the branch mainline in revision 4280.
Revision ID: john@arbash-meinel.com-20090304160155-66iy2jorb5h39n6d

Bring in the 'rabin' experiment.
Change the names and disk-strings for the various repository formats.
Make the CHK format repositories all 'rich-root' we can introduce non-rich-root later.
Make a couple other small tweaks, like copyright statements, etc.
Remove patch-delta.c, at this point, it was only a reference implementation,
as we have fully integrated the patching into pyrex, to allow nicer exception
handling.

files added:
delta.h

diff-delta.c

files renamed:
_groupcompress_c.pyx => _groupcompress_pyx.pyx

tests/test__groupcompress_c.py => tests/test__groupcompress_pyx.py

files modified:
.bzrignore

TODO

__init__.py

groupcompress.py

repofmt.py

setup.py

tests/__init__.py

tests/test_groupcompress.py

Show diffs side-by-side

added added

removed removed

_groupcompress_pyx.pyx

void * malloc(size_t)

void * realloc(void *, size_t)

void free(void *)

void memcpy(void *, void *, size_t)

cdef extern from "delta.h":

struct source_info:

void *buf

unsigned long size

unsigned long agg_offset

struct delta_index:

pass

delta_index * create_delta_index(source_info *src, delta_index *old)

delta_index * create_delta_index_from_delta(source_info *delta,

delta_index *old)

void free_delta_index(delta_index *index)

void *create_delta(delta_index *indexes,

void *buf, unsigned long bufsize,

unsigned long *delta_size, unsigned long max_delta_size)

unsigned long get_delta_hdr_size(unsigned char **datap,

unsigned char *top)

Py_ssize_t DELTA_SIZE_MIN

void *patch_delta(void *src_buf, unsigned long src_size,

void *delta_buf, unsigned long delta_size,

unsigned long *dst_size)

cdef extern from "Python.h":

struct _PyObject:

pass

ctypedef _PyObject PyObject

PyObject *PySequence_Fast(object, char *) except NULL

Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *)

PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t)

PyObject *PyList_GET_ITEM(object, Py_ssize_t)

int PyList_Append(object, object) except -1

long PyObject_Hash(PyObject *) except -1

# We use PyObject_Cmp rather than PyObject_Compare because pyrex will check

# if there is an exception *for* us.

int PyObject_Cmp(PyObject *, PyObject *, int *result) except -1

int PyObject_Not(PyObject *) except -1

void Py_DECREF(PyObject *)

void Py_INCREF(PyObject *)

cdef enum _raw_line_flags:

INDEXED = 0x01

cdef struct _raw_line:

long hash # Cached form of the hash for this entry

Py_ssize_t hash_offset # The location in the hash table for this object

Py_ssize_t next_line_index # Next line which is equivalent to this one

int flags # status flags

PyObject *data # Raw pointer to the original line

cdef struct _hash_bucket:

Py_ssize_t line_index # First line in the left side for this bucket

Py_ssize_t count # Number of equivalent lines, DO we even need this?

cdef Py_ssize_t SENTINEL

SENTINEL = -1

int PyString_CheckExact(object)

char * PyString_AS_STRING(object)

Py_ssize_t PyString_GET_SIZE(object)

object PyString_FromStringAndSize(char *, Py_ssize_t)

cdef void *safe_malloc(size_t count) except NULL:

free(val[0])

val[0] = NULL

cdef class EquivalenceTable:

"""This tracks equivalencies between lists of hashable objects.

:ivar lines: The 'static' lines that will be preserved between runs.

"""

cdef readonly object lines

cdef readonly object _right_lines

cdef Py_ssize_t _hashtable_size

cdef Py_ssize_t _hashtable_bitmask

cdef _hash_bucket *_hashtable

cdef _raw_line *_raw_lines

cdef Py_ssize_t _len_lines

100

def __init__(self, lines):

101

self.lines = list(lines)

102

self._right_lines = None

103

self._hashtable_size = 0

104

self._hashtable = NULL

105

self._raw_lines = NULL

106

self._len_lines = 0

107

self._lines_to_raw_lines(lines)

108

self._build_hash_table()

def make_delta_index(source):

return DeltaIndex(source)

cdef class DeltaIndex:

# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object

# isn't performance critical

# cdef readonly list _sources

cdef readonly object _sources

cdef source_info *_source_infos

cdef delta_index *_index

cdef readonly unsigned int _max_num_sources

cdef public unsigned long _source_offset

def __repr__(self):

return '%s(%d, %d)' % (self.__class__.__name__,

len(self._sources), self._source_offset)

def __init__(self, source=None):

self._sources = []

self._index = NULL

self._max_num_sources = 65000

100

self._source_infos = <source_info *>safe_malloc(sizeof(source_info)

101

* self._max_num_sources)

102

self._source_offset = 0

103

104

if source is not None:

105

self.add_source(source, 0)

109

106

110

107

def __dealloc__(self):

111

safe_free(<void**>&self._hashtable)

112

safe_free(<void**>&self._raw_lines)

113

114

cdef int _line_to_raw_line(self, PyObject *line, _raw_line *raw_line) except -1:

115

"""Convert a single PyObject into a raw line."""

116

raw_line.hash = PyObject_Hash(line)

117

raw_line.next_line_index = SENTINEL

118

raw_line.hash_offset = SENTINEL

119

raw_line.flags = INDEXED

120

raw_line.data = line

121

122

cdef int _lines_to_raw_lines(self, object lines) except -1:

123

"""Load a sequence of objects into the _raw_line format"""

124

cdef Py_ssize_t count, i

125

cdef PyObject *seq

126

cdef PyObject *item

127

cdef _raw_line *raw

128

129

# Do we want to use PySequence_Fast, or just assume that it is a list

130

# Also, do we need to decref the return value?

131

# http://www.python.org/doc/current/api/sequence.html

132

seq = PySequence_Fast(lines, "expected a sequence")

133

try:

134

count = PySequence_Fast_GET_SIZE(seq)

135

if count == 0:

136

return 0

137

raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line))

138

safe_free(<void**>&self._raw_lines)

139

self._raw_lines = raw

140

self._len_lines = count

141

142

for i from 0 <= i < count:

143

item = PySequence_Fast_GET_ITEM(seq, i)

144

# NB: We don't Py_INCREF the data pointer, because we know we

145

# maintain a pointer to the item in self.lines or

146

# self._right_lines

147

# TODO: Do we even need to track a data pointer here? It would

148

# be less memory, and we *could* just look it up in the

149

# appropriate line list.

150

self._line_to_raw_line(item, &raw[i])

151

finally:

152

# TODO: Unfortunately, try/finally always generates a compiler

153

# warning about a possibly unused variable :(

154

Py_DECREF(seq)

155

return count

156

157

cdef Py_ssize_t _compute_minimum_hash_size(self, Py_ssize_t needed):

158

"""Determine the smallest hash size that can reasonably fit the data.

159

160

:param needed: The number of entries we might be inserting into

161

the hash (assuming there are no duplicate lines.)

162

:return: The number of hash table entries to use. This will always be a

163

power of two.

164

"""

165

cdef Py_ssize_t hash_size

166

cdef Py_ssize_t min_size

167

168

# TODO: Another alternative would be to actually count how full the

169

# hash-table is, and decide if we need to grow it based on

170

# density. That can take into account duplicated lines. Though if

171

# we compress well, there should be a minimal amount of

172

# duplicated lines in the output.

173

174

# At the bare minimum, we could fit all entries into a 'needed'

175

# size hash table. However, any collision would then have a long way to

176

# traverse before it could find a 'free' slot.

177

# So we set the minimum size to give us 33% empty slots.

178

min_size = <Py_ssize_t>(needed * 2)

179

hash_size = 1

180

while hash_size < min_size:

181

hash_size = hash_size << 1

182

return hash_size

183

184

def _py_compute_minimum_hash_size(self, needed):

185

"""Expose _compute_minimum_hash_size to python for testing."""

186

return self._compute_minimum_hash_size(needed)

187

188

cdef Py_ssize_t _compute_recommended_hash_size(self, Py_ssize_t needed):

189

"""Determine a reasonable hash size, assuming some room for growth.

190

191

:param needed: The number of entries we might be inserting into

192

the hash (assuming there are no duplicate lines.)

193

:return: The number of hash table entries to use. This will always be a

194

power of two.

195

"""

196

cdef Py_ssize_t hash_size

197

cdef Py_ssize_t min_size

198

199

# We start off with a 8k hash (after doubling), because there isn't a

200

# lot of reason to go smaller than that (this class isn't one you'll be

201

# instantiating thousands of, and you are always likely to grow here.)

202

hash_size = 2048

203

while hash_size < needed:

204

hash_size = hash_size << 1

205

# And we always give at least 4x blank space

206

hash_size = hash_size << 2

207

return hash_size

208

209

def _py_compute_recommended_hash_size(self, needed):

210

"""Expose _compute_recommended_hash_size to python for testing."""

211

return self._compute_recommended_hash_size(needed)

212

213

cdef int _build_hash_table(self) except -1:

214

"""Build the hash table 'from scratch'."""

215

cdef Py_ssize_t hash_size

216

cdef Py_ssize_t hash_bitmask

217

cdef Py_ssize_t i

218

cdef _raw_line *cur_line

219

cdef Py_ssize_t hash_offset

220

cdef _hash_bucket *cur_bucket

221

cdef _hash_bucket *new_hashtable

222

223

# Hash size is a power of 2

224

hash_size = self._compute_recommended_hash_size(self._len_lines)

225

226

new_hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) *

227

hash_size)

228

safe_free(<void**>&self._hashtable)

229

self._hashtable = new_hashtable

230

231

self._hashtable_size = hash_size

232

for i from 0 <= i < hash_size:

233

self._hashtable[i].line_index = SENTINEL

234

self._hashtable[i].count = 0

235

236

# Turn the hash size into a bitmask

237

self._hashtable_bitmask = hash_size - 1

238

239

# Iterate backwards, because it makes it easier to insert items into

240

# the hash (you just change the head pointer, and everything else keeps

241

# pointing to the same location).

242

for i from self._len_lines > i >= 0:

243

cur_line = self._raw_lines + i

244

if not (cur_line.flags & INDEXED == INDEXED):

245

continue

246

hash_offset = self._find_hash_position(cur_line)

247

248

# Point this line to the location in the hash table

249

cur_line.hash_offset = hash_offset

250

# And make this line the head of the hash table

251

cur_bucket = self._hashtable + hash_offset

252

cur_line.next_line_index = cur_bucket.line_index

253

cur_bucket.line_index = i

254

cur_bucket.count = cur_bucket.count + 1

255

256

cdef int _extend_raw_lines(self, object index) except -1:

257

"""Add the last N lines from self.lines into the raw_lines array."""

258

cdef Py_ssize_t new_count

259

cdef Py_ssize_t new_total_len

260

cdef Py_ssize_t old_len

261

cdef PyObject *item

262

cdef PyObject *should_index

263

cdef Py_ssize_t i

264

cdef Py_ssize_t line_index

265

cdef _raw_line *cur_line

266

cdef PyObject *seq_index

267

cdef Py_ssize_t actual_new_len

268

269

seq_index = PySequence_Fast(index, "expected a sequence for index")

270

try:

271

old_len = self._len_lines

272

new_count = PySequence_Fast_GET_SIZE(seq_index)

273

new_total_len = new_count + self._len_lines

274

actual_new_len = 1

275

while actual_new_len < new_total_len:

276

actual_new_len = actual_new_len << 1

277

self._raw_lines = <_raw_line*>safe_realloc(<void*>self._raw_lines,

278

actual_new_len * sizeof(_raw_line))

279

self._len_lines = new_total_len

280

# Now that we have enough space, start adding the new lines

281

# into the array. These are done in forward order.

282

for i from 0 <= i < new_count:

283

line_index = i + old_len

284

cur_line = self._raw_lines + line_index

285

item = PyList_GET_ITEM(self.lines, line_index)

286

self._line_to_raw_line(item, cur_line)

287

should_index = PySequence_Fast_GET_ITEM(seq_index, i)

288

if PyObject_Not(should_index):

289

cur_line.flags = cur_line.flags & ~(<int>INDEXED)

290

finally:

291

Py_DECREF(seq_index)

292

293

cdef int _extend_hash_table(self, Py_ssize_t count) except -1:

294

"""Add the last N entries in self.lines to the hash table.

295

296

:param index: A sequence that declares whether each node should be

297

INDEXED or not.

298

"""

299

cdef Py_ssize_t line_index

300

cdef Py_ssize_t start

301

cdef _raw_line *cur_line

302

cdef _raw_line *next_line

303

cdef Py_ssize_t hash_offset

304

cdef _hash_bucket *cur_bucket

305

306

start = self._len_lines - count

307

308

for line_index from start <= line_index < self._len_lines:

309

cur_line = self._raw_lines + line_index

310

if cur_line.flags & INDEXED != INDEXED:

311

continue

312

hash_offset = self._find_hash_position(cur_line)

313

314

# Point this line to the location in the hash table

315

cur_line.hash_offset = hash_offset

316

317

# Make this line the tail of the hash table

318

cur_bucket = self._hashtable + hash_offset

319

cur_bucket.count = cur_bucket.count + 1

320

if cur_bucket.line_index == SENTINEL:

321

cur_bucket.line_index = line_index

322

continue

323

# We need to track through the pointers and insert this at

324

# the end

325

next_line = self._raw_lines + cur_bucket.line_index

326

while next_line.next_line_index != SENTINEL:

327

next_line = self._raw_lines + next_line.next_line_index

328

next_line.next_line_index = line_index

329

330

cdef Py_ssize_t _find_hash_position(self, _raw_line *line) except -1:

331

"""Find the node in the hash which defines this line.

332

333

Each bucket in the hash table points at exactly 1 equivalent line. If 2

334

objects would collide, we just increment to the next bucket until we

335

get to an empty bucket that is either empty or exactly matches this

336

object.

337

338

:return: The offset in the hash table for this entry

339

"""

340

cdef Py_ssize_t location

341

cdef _raw_line *ref_line

342

cdef Py_ssize_t ref_index

343

cdef int compare_result

344

345

location = line.hash & self._hashtable_bitmask

346

ref_index = self._hashtable[location].line_index

347

while ref_index != SENTINEL:

348

ref_line = self._raw_lines + ref_index

349

if (ref_line.hash == line.hash):

350

PyObject_Cmp(ref_line.data, line.data, &compare_result)

351

if compare_result == 0:

352

break

353

location = (location + 1) & self._hashtable_bitmask

354

ref_index = self._hashtable[location].line_index

355

return location

356

357

def _py_find_hash_position(self, line):

358

"""Used only for testing.

359

360

Return the location where this fits in the hash table

361

"""

362

cdef _raw_line raw_line

363

364

self._line_to_raw_line(<PyObject *>line, &raw_line)

365

return self._find_hash_position(&raw_line)

366

367

def _inspect_left_lines(self):

368

"""Used only for testing.

369

370

:return: None if _raw_lines is NULL,

371

else [(object, hash_val, hash_loc, next_val)] for each node in raw

372

lines.

373

"""

374

cdef Py_ssize_t i

375

376

if self._raw_lines == NULL:

108

if self._index != NULL:

109

free_delta_index(self._index)

110

self._index = NULL

111

safe_free(<void **>&self._source_infos)

112

113

def add_delta_source(self, delta, unadded_bytes):

114

"""Add a new delta to the source texts.

115

116

:param delta: The text of the delta, this must be a byte string.

117

:param unadded_bytes: Number of bytes that were added to the source

118

that were not indexed.

119

"""

120

cdef char *c_delta

121

cdef Py_ssize_t c_delta_size

122

cdef delta_index *index

123

cdef unsigned int source_location

124

cdef source_info *src

125

cdef unsigned int num_indexes

126

127

if not PyString_CheckExact(delta):

128

raise TypeError('delta is not a str')

129

130

source_location = len(self._sources)

131

if source_location >= self._max_num_sources:

132

self._expand_sources()

133

self._sources.append(delta)

134

c_delta = PyString_AS_STRING(delta)

135

c_delta_size = PyString_GET_SIZE(delta)

136

src = self._source_infos + source_location

137

src.buf = c_delta

138

src.size = c_delta_size

139

src.agg_offset = self._source_offset + unadded_bytes

140

index = create_delta_index_from_delta(src, self._index)

141

self._source_offset = src.agg_offset + src.size

142

if index != NULL:

143

free_delta_index(self._index)

144

self._index = index

145

146

def add_source(self, source, unadded_bytes):

147

"""Add a new bit of source text to the delta indexes.

148

149

:param source: The text in question, this must be a byte string

150

:param unadded_bytes: Assume there are this many bytes that didn't get

151

added between this source and the end of the previous source.

152

"""

153

cdef char *c_source

154

cdef Py_ssize_t c_source_size

155

cdef delta_index *index

156

cdef unsigned int source_location

157

cdef source_info *src

158

cdef unsigned int num_indexes

159

160

if not PyString_CheckExact(source):

161

raise TypeError('source is not a str')

162

163

source_location = len(self._sources)

164

if source_location >= self._max_num_sources:

165

self._expand_sources()

166

self._sources.append(source)

167

c_source = PyString_AS_STRING(source)

168

c_source_size = PyString_GET_SIZE(source)

169

src = self._source_infos + source_location

170

src.buf = c_source

171

src.size = c_source_size

172

173

src.agg_offset = self._source_offset + unadded_bytes

174

index = create_delta_index(src, self._index)

175

self._source_offset = src.agg_offset + src.size

176

if index != NULL:

177

free_delta_index(self._index)

178

self._index = index

179

180

cdef _expand_sources(self):

181

raise RuntimeError('if we move self._source_infos, then we need to'

182

' change all of the index pointers as well.')

183

self._max_num_sources = self._max_num_sources * 2

184

self._source_infos = <source_info *>safe_realloc(self._source_infos,

185

sizeof(source_info)

186

* self._max_num_sources)

187

188

def make_delta(self, target_bytes, max_delta_size=0):

189

"""Create a delta from the current source to the target bytes."""

190

cdef char *target

191

cdef Py_ssize_t target_size

192

cdef void * delta

193

cdef unsigned long delta_size

194

195

if self._index == NULL:

377

196

return None

378

197

379

result = []

380

for i from 0 <= i < self._len_lines:

381

PyList_Append(result,

382

(<object>self._raw_lines[i].data,

383

self._raw_lines[i].hash,

384

self._raw_lines[i].hash_offset,

385

self._raw_lines[i].next_line_index,

386

))

387

return result

388

389

def _inspect_hash_table(self):

390

"""Used only for testing.

391

392

This iterates the hash table, and returns 'interesting' entries.

393

:return: (total_size, [(offset, line_index, count)] for entries that

394

are not empty.

395

"""

396

cdef int i

397

398

interesting = []

399

for i from 0 <= i < self._hashtable_size:

400

if self._hashtable[i].line_index != SENTINEL:

401

PyList_Append(interesting,

402

(i, self._hashtable[i].line_index,

403

self._hashtable[i].count))

404

return (self._hashtable_size, interesting)

405

406

def get_matches(self, line):

407

"""Return the lines which match the line in right."""

408

cdef Py_ssize_t count

409

cdef Py_ssize_t i

410

cdef Py_ssize_t *matches

411

412

matches = NULL

413

414

try:

415

count = self.get_raw_matches(<PyObject *>line, &matches)

416

if count == 0:

417

return None

418

result = []

419

for i from 0 <= i < count:

420

PyList_Append(result, matches[i])

421

finally:

422

safe_free(<void**>&matches)

423

return result

424

425

cdef Py_ssize_t get_raw_matches(self, PyObject *line,

426

Py_ssize_t **matches) except -1:

427

"""Get all the possible entries that match the given location.

428

429

:param line: A Python line which we want to match

430

:param pos: The index position in _right_lines

431

:param matches: A variable to hold an array containing all the matches.

432

This will be allocated using malloc, and the caller is responsible

433

to free it with free(). If there are no matches, no memory will be

434

allocated.

435

:retrun: The number of matched positions

436

"""

437

cdef Py_ssize_t hash_offset

438

cdef _raw_line raw_line

439

cdef _hash_bucket cur_bucket

440

cdef Py_ssize_t cur_line_idx

441

cdef Py_ssize_t count

442

cdef Py_ssize_t *local_matches

443

444

self._line_to_raw_line(line, &raw_line)

445

hash_offset = self._find_hash_position(&raw_line)

446

cur_bucket = self._hashtable[hash_offset]

447

cur_line_idx = cur_bucket.line_index

448

if cur_line_idx == SENTINEL:

449

return 0

450

local_matches = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) *

451

(cur_bucket.count + 1))

452

matches[0] = local_matches

453

for count from 0 <= count < cur_bucket.count:

454

assert cur_line_idx != SENTINEL

455

local_matches[count] = cur_line_idx

456

cur_line_idx = self._raw_lines[cur_line_idx].next_line_index

457

return cur_bucket.count

458

459

cdef Py_ssize_t get_next_matches(self, PyObject *line,

460

Py_ssize_t *tips,

461

Py_ssize_t tip_count,

462

int *did_match) except -1:

463

"""Find matches that are interesting for the next step.

464

465

This finds offsets which match the current search tips. It is slightly

466

different from get_raw_matches, in that it intersects matching lines

467

against the existing tips, and then returns the *next* line.

468

469

TODO: It might be nice to know if there were 0 matches because the line

470

didn't match anything, or if it was just because it didn't follow

471

anything.

472

473

:param line: A line to match against

474

:param tips: A list of matches that we already have. This list

475

will be updated "in place". If there are no new matches, the list

476

will go unmodified, and the returned count will be 0.

477

:param tip_count: The current length of tips

478

:param did_match: Will be set to True if the line had matches, else 0

479

:return: The new number of matched lines.

480

"""

481

cdef Py_ssize_t hash_offset

482

cdef _raw_line raw_line

483

cdef _hash_bucket cur_bucket

484

cdef Py_ssize_t cur_line_idx

485

cdef Py_ssize_t count

486

cdef Py_ssize_t *cur_tip

487

cdef Py_ssize_t *end_tip

488

cdef Py_ssize_t *cur_out

489

490

self._line_to_raw_line(line, &raw_line)

491

hash_offset = self._find_hash_position(&raw_line)

492

cur_bucket = self._hashtable[hash_offset]

493

cur_line_idx = cur_bucket.line_index

494

if cur_line_idx == SENTINEL:

495

did_match[0] = 0

496

return 0

497

498

did_match[0] = 1

499

500

cur_tip = tips

501

end_tip = tips + tip_count

502

cur_out = tips

503

count = 0

504

while cur_tip < end_tip and cur_line_idx != SENTINEL:

505

if cur_line_idx == cur_tip[0]:

506

# These match, so put a new entry in the output

507

# We put the *next* line, so that this is ready to pass back in

508

# for the next search.

509

cur_out[0] = (cur_line_idx + 1)

510

count = count + 1

511

cur_out = cur_out + 1

512

cur_tip = cur_tip + 1

513

cur_line_idx = self._raw_lines[cur_line_idx].next_line_index

514

elif cur_line_idx < cur_tip[0]:

515

# cur_line_idx is "behind"

516

cur_line_idx = self._raw_lines[cur_line_idx].next_line_index

517

else:

518

cur_tip = cur_tip + 1

519

return count

520

521

def _get_matching_lines(self):

522

"""Return a dictionary showing matching lines."""

523

matching = {}

524

for line in self.lines:

525

matching[line] = self.get_matches(line)

526

return matching

527

528

def get_idx_matches(self, right_idx):

529

"""Return the left lines matching the right line at the given offset."""

530

line = self._right_lines[right_idx]

531

return self.get_matches(line)

532

533

def extend_lines(self, lines, index):

534

"""Add more lines to the left-lines list.

535

536

:param lines: A list of lines to add

537

:param index: A True/False for each node to define if it should be

538

indexed.

539

"""

540

cdef Py_ssize_t orig_len

541

cdef Py_ssize_t min_new_hash_size

542

assert len(lines) == len(index)

543

min_new_hash_size = self._compute_minimum_hash_size(len(self.lines) +

544

len(lines))

545

orig_len = len(self.lines)

546

self.lines.extend(lines)

547

self._extend_raw_lines(index)

548

if self._hashtable_size >= min_new_hash_size:

549

# Just add the new lines, don't bother resizing the hash table

550

self._extend_hash_table(len(index))

551

return

552

self._build_hash_table()

553

554

def set_right_lines(self, lines):

555

"""Set the lines we will be matching against."""

556

self._right_lines = lines

557

558

559

cdef Py_ssize_t intersect_sorted(Py_ssize_t *base, Py_ssize_t base_count,

560

Py_ssize_t *new_values, Py_ssize_t new_count) except -1:

561

"""Intersect the base values with the new values.

562

563

It is assumed that both base and new_values are sorted in ascending order,

564

with no duplicates.

565

566

The values in 'base' will be modified to only contain the entries that are

567

in both base and new. If nothing matches, 'base' will not be modified

568

569

:param base: An array of base values

570

:param base_count: The length of the base array

571

:param new_values: An array of values to compare to base, will be modified

572

to only contain matching entries

573

:param new_count: The number of new entries

574

:return: The count of matching values

575

"""

576

cdef Py_ssize_t *cur_base

577

cdef Py_ssize_t *cur_new

578

cdef Py_ssize_t *cur_out

579

cdef Py_ssize_t count

580

cdef Py_ssize_t *base_end

581

cdef Py_ssize_t *new_end

582

583

cur_base = base

584

cur_out = base

585

cur_new = new_values

586

base_end = base + base_count

587

new_end = new_values + new_count

588

count = 0

589

590

while cur_base < base_end and cur_new < new_end:

591

if cur_base[0] == cur_new[0]:

592

# This may be assigning to self.

593

cur_out[0] = cur_base[0]

594

count = count + 1

595

cur_out = cur_out + 1

596

# We matched, so move both pointers

597

cur_base = cur_base + 1

598

cur_new = cur_new + 1

599

elif cur_base[0] < cur_new[0]:

600

# cur_base is "behind" so increment it

601

cur_base = cur_base + 1

602

else:

603

# cur_new must be "behind" move its pointer

604

cur_new = cur_new + 1

605

return count

606

607

608

cdef object array_as_list(Py_ssize_t *array, Py_ssize_t count):

609

cdef int i

610

lst = []

611

for i from 0 <= i < count:

612

PyList_Append(lst, array[i])

613

return lst

614

615

616

cdef Py_ssize_t list_as_array(object lst, Py_ssize_t **array) except -1:

617

"""Convert a Python sequence to an integer array.

618

619

:return: The size of the output, -1 on error

620

"""

621

cdef int i

622

cdef PyObject *seq

198

if not PyString_CheckExact(target_bytes):

199

raise TypeError('target is not a str')

200

201

target = PyString_AS_STRING(target_bytes)

202

target_size = PyString_GET_SIZE(target_bytes)

203

204

# TODO: inline some of create_delta so we at least don't have to double

205

# malloc, and can instead use PyString_FromStringAndSize, to

206

# allocate the bytes into the final string

207

delta = create_delta(self._index,

208

target, target_size,

209

&delta_size, max_delta_size)

210

result = None

211

if delta:

212

result = PyString_FromStringAndSize(<char *>delta, delta_size)

213

free(delta)

214

return result

215

216

217

def make_delta(source_bytes, target_bytes):

218

"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""

219

di = DeltaIndex(source_bytes)

220

return di.make_delta(target_bytes)

221

222

223

def apply_delta(source_bytes, delta_bytes):

224

"""Apply a delta generated by make_delta to source_bytes."""

225

cdef char *source

226

cdef Py_ssize_t source_size

227

cdef char *delta

228

cdef Py_ssize_t delta_size

229

cdef unsigned char *data, *top

230

cdef unsigned char *dst_buf, *out, cmd

623

231

cdef Py_ssize_t size

624

625

seq = PySequence_Fast(lst, "expected a sequence for lst")

626

try:

627

size = PySequence_Fast_GET_SIZE(seq)

628

array[0] = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) * size)

629

for i from 0 <= i < size:

630

array[0][i] = <object>PySequence_Fast_GET_ITEM(seq, i)

631

finally:

632

Py_DECREF(seq)

633

return size

634

635

636

def _get_longest_match(equivalence_table, pos, max_pos, locations):

637

"""Get the longest possible match for the current position."""

638

cdef EquivalenceTable eq

639

cdef Py_ssize_t cpos

640

cdef Py_ssize_t cmax_pos

641

cdef Py_ssize_t *matches

642

cdef Py_ssize_t match_count

643

cdef PyObject *line

644

cdef Py_ssize_t *copy_ends

645

cdef Py_ssize_t copy_count

646

cdef Py_ssize_t range_len

647

cdef Py_ssize_t in_start

648

cdef Py_ssize_t i

649

cdef Py_ssize_t intersect_count

650

cdef int did_match

651

652

eq = equivalence_table

653

cpos = pos

654

cmax_pos = max_pos

655

range_start = pos

656

range_len = 0

657

matches = NULL

658

match_count = 0

659

copy_ends = NULL

660

copy_count = 0

661

662

# TODO: Handle when locations is not None

663

# if locations is not None:

664

# copy_count = list_as_array(locations, &copy_ends)

665

# # We know the match for this position

666

# cpos = cpos + 1

667

# range_len = 1

668

while cpos < cmax_pos:

669

# TODO: Instead of grabbing the full list of matches and then doing an

670

# intersection, use a helper that does the intersection

671

# as-it-goes.

672

line = PyList_GET_ITEM(eq._right_lines, cpos)

673

if copy_ends == NULL:

674

copy_count = eq.get_raw_matches(line, &copy_ends)

675

if copy_count == 0:

676

# No matches for this line

677

cpos = cpos + 1

678

break

679

# point at the next location, for the next round trip

680

range_len = 1

681

for i from 0 <= i < copy_count:

682

copy_ends[i] = copy_ends[i] + 1

232

cdef unsigned long cp_off, cp_size

233

234

if not PyString_CheckExact(source_bytes):

235

raise TypeError('source is not a str')

236

if not PyString_CheckExact(delta_bytes):

237

raise TypeError('delta is not a str')

238

239

source = PyString_AS_STRING(source_bytes)

240

source_size = PyString_GET_SIZE(source_bytes)

241

delta = PyString_AS_STRING(delta_bytes)

242

delta_size = PyString_GET_SIZE(delta_bytes)

243

244

# Code taken from patch-delta.c, only brought here to give better error

245

# handling, and to avoid double allocating memory

246

if (delta_size < DELTA_SIZE_MIN):

247

# XXX: Invalid delta block

248

raise RuntimeError('delta_size %d smaller than min delta size %d'

249

% (delta_size, DELTA_SIZE_MIN))

250

251

data = <unsigned char *>delta

252

top = data + delta_size

253

254

# make sure the orig file size matches what we expect

255

# XXX: gcc warns because data isn't defined as 'const'

256

size = get_delta_hdr_size(&data, top)

257

if (size > source_size):

258

# XXX: mismatched source size

259

raise RuntimeError('source size %d < expected source size %d'

260

% (source_size, size))

261

source_size = size

262

263

# now the result size

264

size = get_delta_hdr_size(&data, top)

265

result = PyString_FromStringAndSize(NULL, size)

266

dst_buf = <unsigned char*>PyString_AS_STRING(result)

267

# XXX: The original code added a trailing null here, but this shouldn't be

268

# necessary when using PyString_FromStringAndSize

269

# dst_buf[size] = 0

270

271

out = dst_buf

272

while (data < top):

273

cmd = data[0]

274

data = data + 1

275

if (cmd & 0x80):

276

cp_off = cp_size = 0

277

if (cmd & 0x01):

278

cp_off = data[0]

279

data = data + 1

280

if (cmd & 0x02):

281

cp_off = cp_off | (data[0] << 8)

282

data = data + 1

283

if (cmd & 0x04):

284

cp_off = cp_off | (data[0] << 16)

285

data = data + 1

286

if (cmd & 0x08):

287

cp_off = cp_off | (data[0] << 24)

288

data = data + 1

289

if (cmd & 0x10):

290

cp_size = data[0]

291

data = data + 1

292

if (cmd & 0x20):

293

cp_size = cp_size | (data[0] << 8)

294

data = data + 1

295

if (cmd & 0x40):

296

cp_size = cp_size | (data[0] << 16)

297

data = data + 1

298

if (cp_size == 0):

299

cp_size = 0x10000

300

if (cp_off + cp_size < cp_size or

301

cp_off + cp_size > source_size or

302

cp_size > size):

303

raise RuntimeError('Something wrong with:'

304

' cp_off = %s, cp_size = %s'

305

' source_size = %s, size = %s'

306

% (cp_off, cp_size, source_size, size))

307

memcpy(out, source + cp_off, cp_size)

308

out = out + cp_size

309

size = size - cp_size

310

elif (cmd):

311

if (cmd > size):

312

raise RuntimeError('Insert instruction longer than remaining'

313

' bytes: %d > %d' % (cmd, size))

314

memcpy(out, data, cmd)

315

out = out + cmd

316

data = data + cmd

317

size = size - cmd

683

318

else:

684

# We are in the middle of a match

685

intersect_count = eq.get_next_matches(line, copy_ends, copy_count,

686

&did_match)

687

if intersect_count == 0:

688

# No more matches

689

if not did_match:

690

# because the next line is not interesting

691

cpos = cpos + 1

692

break

693

else:

694

copy_count = intersect_count

695

range_len = range_len + 1

696

cpos = cpos + 1

697

if copy_ends == NULL or copy_count == 0:

698

# We never had a match

699

return None, cpos, None

700

# Because copy_ends is always sorted, we know the 'min' is just the first

701

# node

702

in_start = copy_ends[0]

703

safe_free(<void**>&copy_ends)

704

return ((in_start - range_len), range_start, range_len), cpos, None

319

# /*

320

# * cmd == 0 is reserved for future encoding

321

# * extensions. In the mean time we must fail when

322

# * encountering them (might be data corruption).

323

# */

324

## /* XXX: error("unexpected delta opcode 0"); */

325

raise RuntimeError('Got delta opcode: 0, not supported')

326

327

# /* sanity check */

328

if (data != top or size != 0):

329

## /* XXX: error("delta replay has gone wild"); */

330

raise RuntimeError('Did not extract the number of bytes we expected'

331

' we were left with %d bytes in "size", and top - data = %d'

332

% (size, <int>(top - data)))

333

return None

334

335

# *dst_size = out - dst_buf;

336

assert (out - dst_buf) == PyString_GET_SIZE(result)

337

return result

Older »