~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/_simple_set_pyx.pyx

Committer: John Arbash Meinel
Date: 2009-10-12 20:12:32 UTC
mfrom: (4736 +trunk)
mto: This revision was merged to the branch mainline in revision 4749.
Revision ID: john@arbash-meinel.com-20091012201232-ccnceqvk29u7v22u

Merge bzr.dev 4736

This includes the updates to StaticTuple, etc, to build using Pyrex 0.9.6.4

files modified:
NEWS

bzrlib/_simple_set_pyx.pxd

bzrlib/_simple_set_pyx.pyx

bzrlib/_static_tuple_c.c

bzrlib/_static_tuple_py.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bundle/apply_bundle.py

bzrlib/decorators.py

bzrlib/diff.py

bzrlib/foreign.py

bzrlib/help_topics/en/debug-flags.txt

bzrlib/lock.py

bzrlib/lockable_files.py

bzrlib/lockdir.py

bzrlib/merge.py

bzrlib/mutabletree.py

bzrlib/python-compat.h

bzrlib/reconfigure.py

bzrlib/remote.py

bzrlib/repofmt/pack_repo.py

bzrlib/repository.py

bzrlib/send.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/lock_helpers.py

bzrlib/tests/per_branch/test_locking.py

bzrlib/tests/per_repository/test_write_group.py

bzrlib/tests/test__simple_set.py

bzrlib/tests/test__static_tuple.py

bzrlib/tests/test_decorators.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_mutabletree.py

bzrlib/tests/test_reconfigure.py

bzrlib/tests/test_status.py

bzrlib/workingtree.py

doc/developers/HACKING.txt

setup.py

Show diffs side-by-side

added added

removed removed

bzrlib/_simple_set_pyx.pyx

"""Definition of a class that is similar to Set with some small changes."""

cdef extern from "python-compat.h":

pass

cdef extern from "Python.h":

ctypedef unsigned long size_t

ctypedef long (*hashfunc)(PyObject*)

traverseproc tp_traverse

PyTypeObject *Py_TYPE(PyObject *)

int PyObject_IsTrue(PyObject *)

void *PyMem_Malloc(size_t nbytes)

void PyMem_Free(void *)

void memset(void *, int, size_t)

# Dummy is an object used to mark nodes that have been deleted. Since

# collisions require us to move a node to an alternative location, if we just

# set an entry to NULL on delete, we won't find any relocated nodes.

# We have to use _dummy_obj because we need to keep a refcount to it, but we

# also use _dummy as a pointer, because it avoids having to put <PyObject*> all

# over the code base.

cdef object _dummy_obj

cdef PyObject *_dummy

_dummy_obj = object()

_dummy = <PyObject *>_dummy_obj

cdef int _is_equal(PyObject *this, long this_hash, PyObject *other):

cdef int _is_equal(PyObject *this, long this_hash, PyObject *other) except -1:

cdef long other_hash

cdef PyObject *res

if this == other:

return 1

other_hash = Py_TYPE(other).tp_hash(other)

if other_hash == -1:

# Even though other successfully hashed in the past, it seems to have

# changed its mind, and failed this time, so propogate the failure.

return -1

if other_hash != this_hash:

return 0

# This implements a subset of the PyObject_RichCompareBool functionality.

# Namely it:

# 1) Doesn't try to do anything with old-style classes

# 2) Assumes that both objects have a tp_richcompare implementation, and

# that if that is not enough to compare equal, then they are not

# equal. (It doesn't try to cast them both to some intermediate form

# that would compare equal.)

res = Py_TYPE(this).tp_richcompare(this, other, Py_EQ)

if res == Py_True:

if res == NULL: # Exception

return -1

if PyObject_IsTrue(res):

Py_DECREF(res)

return 1

if res == Py_NotImplemented:

Py_DECREF(res)

res = Py_TYPE(other).tp_richcompare(other, this, Py_EQ)

if res == Py_True:

if res == NULL:

return -1

if PyObject_IsTrue(res):

Py_DECREF(res)

return 1

Py_DECREF(res)

110

"""

111

# Attributes are defined in the .pxd file

112

DEF DEFAULT_SIZE=1024

DEF PERTURB_SHIFT=5

113

114

def __init__(self):

115

cdef Py_ssize_t size, n_bytes

170

195

as it makes a lot of assuptions about keys not already being present,

171

196

and there being no dummy entries.

172

197

"""

173

cdef size_t i, perturb, mask

198

cdef size_t i, n_lookup

174

199

cdef long the_hash

175

cdef PyObject **table, **entry

200

cdef PyObject **table, **slot

201

cdef Py_ssize_t mask

176

202

177

203

mask = self._mask

178

204

table = self._table

179

205

180

206

the_hash = Py_TYPE(key).tp_hash(key)

181

i = the_hash & mask

182

entry = &table[i]

183

perturb = the_hash

184

# Because we know that we made all items unique before, we can just

185

# iterate as long as the target location is not empty, we don't have to

186

# do any comparison, etc.

187

while entry[0] != NULL:

188

i = (i << 2) + i + perturb + 1

189

entry = &table[i & mask]

190

perturb >>= PERTURB_SHIFT

191

entry[0] = key

192

self._fill += 1

193

self._used += 1

207

if the_hash == -1:

208

return -1

209

i = the_hash

210

for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever

211

slot = &table[i & mask]

212

if slot[0] == NULL:

213

slot[0] = key

214

self._fill = self._fill + 1

215

self._used = self._used + 1

216

return 1

217

i = i + 1 + n_lookup

218

raise RuntimeError('ran out of slots.')

194

219

195

220

def _py_resize(self, min_used):

196

221

"""Do not use this directly, it is only exposed for testing."""

206

231

:return: The new size of the internal table

207

232

"""

208

233

cdef Py_ssize_t new_size, n_bytes, remaining

209

cdef PyObject **new_table, **old_table, **entry

234

cdef PyObject **new_table, **old_table, **slot

210

235

211

236

new_size = DEFAULT_SIZE

212

237

while new_size <= min_used and new_size > 0:

236

261

237

262

# Moving everything to the other table is refcount neutral, so we don't

238

263

# worry about it.

239

entry = old_table

264

slot = old_table

240

265

while remaining > 0:

241

if entry[0] == NULL: # unused slot

266

if slot[0] == NULL: # unused slot

242

267

pass

243

elif entry[0] == _dummy: # dummy slot

244

remaining -= 1

268

elif slot[0] == _dummy: # dummy slot

269

remaining = remaining - 1

245

270

else: # active slot

246

remaining -= 1

247

self._insert_clean(entry[0])

248

entry += 1

271

remaining = remaining - 1

272

self._insert_clean(slot[0])

273

slot = slot + 1

249

274

PyMem_Free(old_table)

250

275

return new_size

251

276

262

287

cdef PyObject **slot, *py_key

263

288

cdef int added

264

289

290

py_key = <PyObject *>key

291

if (Py_TYPE(py_key).tp_richcompare == NULL

292

or Py_TYPE(py_key).tp_hash == NULL):

293

raise TypeError('Types added to SimpleSet must implement'

294

' both tp_richcompare and tp_hash')

265

295

added = 0

266

296

# We need at least one empty slot

267

297

assert self._used < self._mask

268

298

slot = _lookup(self, key)

269

py_key = <PyObject *>key

270

299

if (slot[0] == NULL):

271

300

Py_INCREF(py_key)

272

self._fill += 1

273

self._used += 1

301

self._fill = self._fill + 1

302

self._used = self._used + 1

274

303

slot[0] = py_key

275

304

added = 1

276

305

elif (slot[0] == _dummy):

277

306

Py_INCREF(py_key)

278

self._used += 1

307

self._used = self._used + 1

279

308

slot[0] = py_key

280

309

added = 1

281

310

# No else: clause. If _lookup returns a pointer to

291

320

return retval

292

321

293

322

def discard(self, key):

294

"""Remove key from the dict, whether it exists or not.

323

"""Remove key from the set, whether it exists or not.

295

324

296

:return: 0 if the item did not exist, 1 if it did

325

:return: False if the item did not exist, True if it did

297

326

"""

298

return self._discard(key)

327

if self._discard(key):

328

return True

329

return False

299

330

300

331

cdef int _discard(self, key) except -1:

301

332

cdef PyObject **slot, *py_key

303

334

slot = _lookup(self, key)

304

335

if slot[0] == NULL or slot[0] == _dummy:

305

336

return 0

306

self._used -= 1

337

self._used = self._used - 1

307

338

Py_DECREF(slot[0])

308

339

slot[0] = _dummy

309

340

# PySet uses the heuristic: If more than 1/5 are dummies, then resize

320

351

self._resize(self._used * 2)

321

352

return 1

322

353

323

def __delitem__(self, key):

324

"""Remove the given item from the dict.

325

326

Raise a KeyError if the key was not present.

327

"""

328

cdef int exists

329

exists = self._discard(key)

330

if not exists:

331

raise KeyError('Key %s not present' % (key,))

332

333

354

def __iter__(self):

334

355

return _SimpleSet_iterator(self)

335

356

353

374

354

375

def __next__(self):

355

376

cdef Py_ssize_t mask, i

356

cdef PyObject **table

377

cdef PyObject *key

357

378

358

379

if self.set is None:

359

380

raise StopIteration

361

382

# Force this exception to continue to be raised

362

383

self._used = -1

363

384

raise RuntimeError("Set size changed during iteration")

364

i = self.pos

365

mask = self.set._mask

366

table = self.set._table

367

assert i >= 0

368

while i <= mask and (table[i] == NULL or table[i] == _dummy):

369

i += 1

370

self.pos = i + 1

371

if i > mask:

372

# we walked to the end

385

if not SimpleSet_Next(self.set, &self.pos, &key):

373

386

self.set = None

374

387

raise StopIteration

375

# We must have found one

376

key = <object>(table[i])

377

self.len -= 1

378

return key

388

# we found something

389

the_key = <object>key # INCREF

390

self.len = self.len - 1

391

return the_key

379

392

380

393

def __length_hint__(self):

381

394

if self.set is not None and self._used == self.set._used:

411

424

412

425

:param key: An object we are looking up

413

426

:param hash: The hash for key

414

:return: The location in self.table where key should be put

415

should never be NULL, but may reference a NULL (PyObject*)

427

:return: The location in self.table where key should be put.

428

location == NULL is an exception, but (*location) == NULL just

429

indicates the slot is empty and can be used.

416

430

"""

417

# This is the heart of most functions, which is why it is pulled out as an

418

# cdef inline function.

419

cdef size_t i, perturb

431

# This uses Quadratic Probing:

432

# http://en.wikipedia.org/wiki/Quadratic_probing

433

# with c1 = c2 = 1/2

434

# This leads to probe locations at:

435

# h0 = hash(k1)

436

# h1 = h0 + 1

437

# h2 = h0 + 3 = h1 + 1 + 1

438

# h3 = h0 + 6 = h2 + 1 + 2

439

# h4 = h0 + 10 = h2 + 1 + 3

440

# Note that all of these are '& mask', but that is computed *after* the

441

# offset.

442

# This differs from the algorithm used by Set and Dict. Which, effectively,

443

# use double-hashing, and a step size that starts large, but dwindles to

444

# stepping one-by-one.

445

# This gives more 'locality' in that if you have a collision at offset X,

446

# the first fallback is X+1, which is fast to check. However, that means

447

# that an object w/ hash X+1 will also check there, and then X+2 next.

448

# However, for objects with differing hashes, their chains are different.

449

# The former checks X, X+1, X+3, ... the latter checks X+1, X+2, X+4, ...

450

# So different hashes diverge quickly.

451

# A bigger problem is that we *only* ever use the lowest bits of the hash

452

# So all integers (x + SIZE*N) will resolve into the same bucket, and all

453

# use the same collision resolution. We may want to try to find a way to

454

# incorporate the upper bits of the hash with quadratic probing. (For

455

# example, X, X+1, X+3+some_upper_bits, X+6+more_upper_bits, etc.)

456

cdef size_t i, n_lookup

420

457

cdef Py_ssize_t mask

421

458

cdef long key_hash

422

cdef long this_hash

423

cdef PyObject **table, **cur, **free_slot, *py_key

459

cdef PyObject **table, **slot, *cur, **free_slot, *py_key

424

460

461

# hash is a signed long(), we are using an offset at unsigned size_t

425

462

key_hash = hash(key)

463

i = <size_t>key_hash

426

464

mask = self._mask

427

465

table = self._table

428

i = key_hash & mask

429

cur = &table[i]

466

free_slot = NULL

430

467

py_key = <PyObject *>key

431

if cur[0] == NULL:

432

# Found a blank spot, or found the exact key

433

return cur

434

if cur[0] == py_key:

435

return cur

436

if cur[0] == _dummy:

437

free_slot = cur

438

else:

439

if _is_equal(py_key, key_hash, cur[0]):

440

# Both py_key and cur[0] belong in this slot, return it

441

return cur

442

free_slot = NULL

443

# size_t is unsigned, hash is signed...

444

perturb = key_hash

445

while True:

446

i = (i << 2) + i + perturb + 1

447

cur = &table[i & mask]

448

if cur[0] == NULL: # Found an empty spot

449

if free_slot: # Did we find a _dummy earlier?

468

for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever

469

slot = &table[i & mask]

470

cur = slot[0]

471

if cur == NULL:

472

# Found a blank spot

473

if free_slot != NULL:

474

# Did we find an earlier _dummy entry?

450

475

return free_slot

451

476

else:

452

return cur

453

if (cur[0] == py_key # exact match

454

or _is_equal(py_key, key_hash, cur[0])): # Equivalent match

455

return cur

456

if (cur[0] == _dummy and free_slot == NULL):

457

free_slot = cur

458

perturb >>= PERTURB_SHIFT

477

return slot

478

if cur == py_key:

479

# Found an exact pointer to the key

480

return slot

481

if cur == _dummy:

482

if free_slot == NULL:

483

free_slot = slot

484

elif _is_equal(py_key, key_hash, cur):

485

# Both py_key and cur belong in this slot, return it

486

return slot

487

i = i + 1 + n_lookup

459

488

raise AssertionError('should never get here')

460

489

461

490

521

550

return _check_self(self)._used

522

551

523

552

524

# TODO: this should probably have direct tests, since it isn't used by __iter__

525

553

cdef api int SimpleSet_Next(object self, Py_ssize_t *pos, PyObject **key):

526

554

"""Walk over items in a SimpleSet.

527

555

540

568

mask = true_self._mask

541

569

table= true_self._table

542

570

while (i <= mask and (table[i] == NULL or table[i] == _dummy)):

543

i += 1

571

i = i + 1

544

572

pos[0] = i + 1

545

573

if (i > mask):

546

574

return 0 # All done

565

593

ret = visit(next_key, arg)

566

594

if ret:

567

595

return ret

568

569

return 0;

596

return 0

570

597

571

598

# It is a little bit ugly to do this, but it works, and means that Meliae can

572

599

# dump the total memory consumed by all child objects.

Older »