~bzr-pqm/bzr/bzr.dev : revision 4216.5.1

1

2

#

3

# This program is free software; you can redistribute it and/or modify

4

# it under the terms of the GNU General Public License as published by

5

# the Free Software Foundation; either version 2 of the License, or

6

# (at your option) any later version.

7

#

8

# This program is distributed in the hope that it will be useful,

9

# but WITHOUT ANY WARRANTY; without even the implied warranty of

10

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11

# GNU General Public License for more details.

12

#

13

# You should have received a copy of the GNU General Public License

14

# along with this program; if not, write to the Free Software

15

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

16

17

"""Persistent maps from tuple_of_strings->string using CHK stores.

18

19

Overview and current status:

The CHKMap class implements a dict from tuple_of_strings->string by using a trie
with internal nodes of 8-bit fan out; the key tuples are mapped to strings by
joining them by \x00, and \x00 padding shorter keys out to the length of the
longest key. Leaf nodes are packed as densely as possible, and internal nodes
are all an additional 8-bits wide leading to a sparse upper tree.

Updates to a CHKMap are done preferentially via the apply_delta method, to
allow optimisation of the update operation; but individual map/unmap calls are
possible and supported. All changes via map/unmap are buffered in memory until
the _save method is called to force serialisation of the tree. apply_delta
performs a _save implicitly.

TODO:
-----

Densely packed upper nodes.

37

38

"""

39

40

import heapq

41

import time

42

43

from bzrlib import lazy_import

44

lazy_import.lazy_import(globals(), """

45

from bzrlib import versionedfile

46

""")

47

from bzrlib import (

48

errors,

49

lru_cache,

50

osutils,

51

registry,

52

trace,

53

)

54

55

# Size budget for the shared page cache: approx 4MB.
# If each line is 50 bytes, and you have 255 internal pages, with 255-way fan
# out, it takes 3.1MB to cache the layer.
_PAGE_CACHE_SIZE = 4*1024*1024
# We are caching bytes so len(value) is perfectly accurate
_page_cache = lru_cache.LRUSizeCache(_PAGE_CACHE_SIZE)

61

62

# If a ChildNode falls below this many bytes, we check for a remap
_INTERESTING_NEW_SIZE = 50
# If a ChildNode shrinks by more than this amount, we check for a remap
_INTERESTING_SHRINKAGE_LIMIT = 20
# If we delete more than this many nodes applying a delta, we check for a remap
_INTERESTING_DELETES_LIMIT = 5

68

69

70

def _search_key_plain(key):
    """Map the key tuple into a search string that just uses the key bytes.

    :param key: A tuple of strings.
    :return: The elements of key joined with a single NUL byte between
        each pair of elements.
    """
    return '\x00'.join(key)

73

74

75

# Registry of search-key functions, looked up by name; 'plain' is the
# function that simply joins the key elements.
search_key_registry = registry.Registry()
search_key_registry.register('plain', _search_key_plain)

77

78

79

class CHKMap(object):
    """A persistent map from string to string backed by a CHK store."""

    def __init__(self, store, root_key, search_key_func=None):
        """Create a CHKMap object.

        :param store: The store the CHKMap is stored in.
        :param root_key: The root key of the map. None to create an empty
            CHKMap.
        :param search_key_func: A function mapping a key => bytes. These bytes
            are then used by the internal nodes to split up leaf nodes into
            multiple pages.
        """
        self._store = store
        if search_key_func is None:
            search_key_func = _search_key_plain
        self._search_key_func = search_key_func
        if root_key is None:
            self._root_node = LeafNode(search_key_func=search_key_func)
        else:
            # Keep only the key for now; the node itself is loaded on demand
            # by _ensure_root().
            self._root_node = self._node_key(root_key)

    def apply_delta(self, delta):
        """Apply a delta to the map.

        All removals are applied before any insertion, then the map is saved.

        :param delta: An iterable of old_key, new_key, new_value tuples.
            If new_key is not None, then new_key->new_value is inserted
            into the map; if old_key is not None, then the old mapping
            of old_key is removed.
        :return: The key of the root node after saving.
        """
        # delta is walked twice (removals, then insertions). Materialise it
        # so that a one-shot iterator is not exhausted by the first pass,
        # which would silently drop every insertion.
        delta = list(delta)
        delete_count = 0
        for old, new, value in delta:
            if old is not None and old != new:
                self.unmap(old, check_remap=False)
                delete_count += 1
        for old, new, value in delta:
            if new is not None:
                self.map(new, value)
        # Remap checks were suppressed per-unmap above; do a single check
        # here if enough keys went away.
        if delete_count > _INTERESTING_DELETES_LIMIT:
            trace.mutter("checking remap as %d deletions", delete_count)
            self._check_remap()
        return self._save()

    def _ensure_root(self):
        """Ensure that the root node is an object not a key."""
        if type(self._root_node) is tuple:
            # Demand-load the root
            self._root_node = self._get_node(self._root_node)

    def _get_node(self, node):
        """Get a node.

        Note that this does not update the _items dict in objects containing a
        reference to this node. As such it does not prevent subsequent IO being
        performed.

        :param node: A tuple key or node object.
        :return: A node object.
        """
        if type(node) is tuple:
            node_bytes = self._read_bytes(node)
            return _deserialise(node_bytes, node,
                                search_key_func=self._search_key_func)
        else:
            return node

    def _read_bytes(self, key):
        """Return the bytes stored under key, consulting the page cache first.

        On a cache miss the bytes are read from the store as a fulltext and
        added to the page cache before being returned.

        :param key: The key of the page to read.
        """
        try:
            return _page_cache[key]
        except KeyError:
            stream = self._store.get_record_stream([key], 'unordered', True)
            page_bytes = stream.next().get_bytes_as('fulltext')
            _page_cache[key] = page_bytes
            return page_bytes

    def _dump_tree(self, include_keys=False):
        """Return the tree in a string representation.

        :param include_keys: If True, each node line also shows the first
            element of the node's key (or 'None' if it has no key).
        """
        self._ensure_root()
        res = self._dump_tree_node(self._root_node, prefix='', indent='',
                                   include_keys=include_keys)
        res.append('')  # Give a trailing '\n'
        return '\n'.join(res)

    def _dump_tree_node(self, node, prefix, indent, include_keys=True):
        """For this node and all children, generate a string representation.

        :param node: The node object to dump.
        :param prefix: The prefix under which node is referenced.
        :param indent: The string prepended to this node's line.
        :param include_keys: Whether to show node keys.
        :return: A list of lines (without trailing newlines).
        """
        # NOTE(review): the whitespace inside the ' ' and ' %r %r' literals
        # below is reproduced exactly as found; confirm it against the
        # canonical source in case runs of spaces were collapsed.
        result = []
        if not include_keys:
            key_str = ''
        else:
            node_key = node.key()
            if node_key is not None:
                key_str = ' %s' % (node_key[0],)
            else:
                key_str = ' None'
        result.append('%s%r %s%s' % (indent, prefix, node.__class__.__name__,
                                     key_str))
        if type(node) is InternalNode:
            # Trigger all child nodes to get loaded
            list(node._iter_nodes(self._store))
            for child_prefix, sub in sorted(node._items.iteritems()):
                result.extend(self._dump_tree_node(sub, child_prefix,
                                                   indent + ' ',
                                                   include_keys=include_keys))
        else:
            for key, value in sorted(node._items.iteritems()):
                # Don't use prefix nor indent here to line up when used in
                # tests in conjunction with assertEqualDiff
                result.append(' %r %r' % (key, value))
        return result

    @classmethod
    def from_dict(klass, store, initial_value, maximum_size=0, key_width=1,
                  search_key_func=None):
        """Create a CHKMap in store with initial_value as the content.

        :param store: The store to record initial_value in, a VersionedFiles
            object with 1-tuple keys supporting CHK key generation.
        :param initial_value: A dict to store in store. Its keys and values
            must be bytestrings.
        :param maximum_size: The maximum_size rule to apply to nodes. This
            determines the size at which no new data is added to a single node.
        :param key_width: The number of elements in each key_tuple being stored
            in this map.
        :param search_key_func: A function mapping a key => bytes. These bytes
            are then used by the internal nodes to split up leaf nodes into
            multiple pages.
        :return: The root chk of the resulting CHKMap.
        """
        result = CHKMap(store, None, search_key_func=search_key_func)
        result._root_node.set_maximum_size(maximum_size)
        result._root_node._key_width = key_width
        # Express the initial content as pure insertions.
        delta = [(None, key, value) for key, value in initial_value.items()]
        return result.apply_delta(delta)

    def iter_changes(self, basis):
        """Iterate over the changes between basis and self.

        :param basis: The CHKMap to compare against.
        :return: An iterator of tuples: (key, old_value, new_value). Old_value
            is None for keys only in self; new_value is None for keys only in
            basis.
        """
        # Overview:
        # Read both trees in lexicographic, highest-first order.
        # Any identical nodes we skip
        # Any unique prefixes we output immediately.
        # values in a leaf node are treated as single-value nodes in the tree
        # which allows them to be not-special-cased. We know to output them
        # because their value is a string, not a key(tuple) or node.
        #
        # corner cases to beware of when considering this function:
        # *) common references are at different heights.
        #    consider two trees:
        #    {'a': LeafNode={'aaa':'foo', 'aab':'bar'}, 'b': LeafNode={'b'}}
        #    {'a': InternalNode={'aa':LeafNode={'aaa':'foo', 'aab':'bar'},
        #                        'ab':LeafNode={'ab':'bar'}}
        #     'b': LeafNode={'b'}}
        #    the node with aaa/aab will only be encountered in the second tree
        #    after reading the 'a' subtree, but it is encountered in the first
        #    tree immediately. Variations on this may have read internal nodes
        #    like this.  we want to cut the entire pending subtree when we
        #    realise we have a common node.  For this we use a list of keys -
        #    the path to a node - and check the entire path is clean as we
        #    process each item.
        if self._node_key(self._root_node) == self._node_key(basis._root_node):
            return
        self._ensure_root()
        basis._ensure_root()
        # Keys of nodes whose whole subtree should be skipped. Nothing in
        # this method currently adds to it, so check_excluded() below always
        # answers False; it is kept as the hook described in the overview.
        excluded_keys = set()
        self_node = self._root_node
        basis_node = basis._root_node
        # A heap, each element is prefix, node(tuple/NodeObject/string),
        # key_path (a list of tuples, tail-sharing down the tree.)
        self_pending = []
        basis_pending = []

        def process_node(node, path, a_map, pending):
            # take a node and expand it
            node = a_map._get_node(node)
            if type(node) is LeafNode:
                path = (node._key, path)
                for key, value in node._items.items():
                    # For a LeafNode, the key is a serialized_key, rather than
                    # a search_key, but the heap is using search_keys
                    search_key = node._search_key_func(key)
                    heapq.heappush(pending, (search_key, key, value, path))
            else:
                # type(node) == InternalNode
                path = (node._key, path)
                for prefix, child in node._items.items():
                    heapq.heappush(pending, (prefix, None, child, path))

        def process_common_internal_nodes(self_node, basis_node):
            # Only queue the child references that differ between the two
            # internal nodes.
            self_items = set(self_node._items.items())
            basis_items = set(basis_node._items.items())
            path = (self_node._key, None)
            for prefix, child in self_items - basis_items:
                heapq.heappush(self_pending, (prefix, None, child, path))
            path = (basis_node._key, None)
            for prefix, child in basis_items - self_items:
                heapq.heappush(basis_pending, (prefix, None, child, path))

        def process_common_leaf_nodes(self_node, basis_node):
            # Only queue the key/value pairs that differ between the two
            # leaf nodes.
            self_items = set(self_node._items.items())
            basis_items = set(basis_node._items.items())
            path = (self_node._key, None)
            for key, value in self_items - basis_items:
                prefix = self._search_key_func(key)
                heapq.heappush(self_pending, (prefix, key, value, path))
            path = (basis_node._key, None)
            for key, value in basis_items - self_items:
                prefix = basis._search_key_func(key)
                heapq.heappush(basis_pending, (prefix, key, value, path))

        def process_common_prefix_nodes(self_node, self_path,
                                        basis_node, basis_path):
            # Would it be more efficient if we could request both at the same
            # time?
            self_node = self._get_node(self_node)
            basis_node = basis._get_node(basis_node)
            if (type(self_node) is InternalNode
                and type(basis_node) is InternalNode):
                # Matching internal nodes
                process_common_internal_nodes(self_node, basis_node)
            elif (type(self_node) is LeafNode
                  and type(basis_node) is LeafNode):
                process_common_leaf_nodes(self_node, basis_node)
            else:
                process_node(self_node, self_path, self, self_pending)
                process_node(basis_node, basis_path, basis, basis_pending)

        process_common_prefix_nodes(self_node, None, basis_node, None)

        def check_excluded(key_path):
            # Note that this is N^2, it depends on us trimming trees
            # aggressively to not become slow.
            # A better implementation would probably have a reverse map
            # back to the children of a node, and jump straight to it when
            # a common node is detected, the proceed to remove the already
            # pending children. bzrlib.graph has a searcher module with a
            # similar problem.
            while key_path is not None:
                key, key_path = key_path
                if key in excluded_keys:
                    return True
            return False

        while self_pending or basis_pending:
            if not self_pending:
                # self is exhausted: output remainder of basis
                for prefix, key, node, path in basis_pending:
                    if check_excluded(path):
                        continue
                    node = basis._get_node(node)
                    if key is not None:
                        # a value
                        yield (key, node, None)
                    else:
                        # subtree - fastpath the entire thing.
                        for sub_key, value in node.iteritems(basis._store):
                            yield (sub_key, value, None)
                return
            elif not basis_pending:
                # basis is exhausted: output remainder of self.
                for prefix, key, node, path in self_pending:
                    if check_excluded(path):
                        continue
                    node = self._get_node(node)
                    if key is not None:
                        # a value
                        yield (key, None, node)
                    else:
                        # subtree - fastpath the entire thing.
                        for sub_key, value in node.iteritems(self._store):
                            yield (sub_key, None, value)
                return
            else:
                # XXX: future optimisation - yield the smaller items
                # immediately rather than pushing everything on/off the
                # heaps. Applies to both internal nodes and leafnodes.
                if self_pending[0][0] < basis_pending[0][0]:
                    # expand self
                    prefix, key, node, path = heapq.heappop(self_pending)
                    if check_excluded(path):
                        continue
                    if key is not None:
                        # a value
                        yield (key, None, node)
                    else:
                        process_node(node, path, self, self_pending)
                    continue
                elif self_pending[0][0] > basis_pending[0][0]:
                    # expand basis
                    prefix, key, node, path = heapq.heappop(basis_pending)
                    if check_excluded(path):
                        continue
                    if key is not None:
                        # a value
                        yield (key, node, None)
                    else:
                        process_node(node, path, basis, basis_pending)
                    continue
                else:
                    # common prefix: possibly expand both.
                    # An entry whose key slot is None is a node reference
                    # that still needs expanding; otherwise it is a value.
                    read_self = self_pending[0][1] is None
                    read_basis = basis_pending[0][1] is None
                    if not read_self and not read_basis:
                        # compare a common value
                        self_details = heapq.heappop(self_pending)
                        basis_details = heapq.heappop(basis_pending)
                        if self_details[2] != basis_details[2]:
                            yield (self_details[1],
                                   basis_details[2], self_details[2])
                        continue
                    # At least one side wasn't a simple value
                    if read_self and read_basis:
                        # Only compare pointers when both sides really are
                        # node references: _node_key() on a plain value
                        # string would raise AttributeError.
                        if (self._node_key(self_pending[0][2]) ==
                            self._node_key(basis_pending[0][2])):
                            # Identical pointers, skip (and don't bother
                            # adding to excluded, it won't turn up again.
                            heapq.heappop(self_pending)
                            heapq.heappop(basis_pending)
                            continue
                        # Both sides start with the same prefix, so process
                        # them in parallel
                        self_prefix, _, self_node, self_path = heapq.heappop(
                            self_pending)
                        basis_prefix, _, basis_node, basis_path = heapq.heappop(
                            basis_pending)
                        if self_prefix != basis_prefix:
                            raise AssertionError(
                                'prefixes mismatch: %r != %r'
                                % (self_prefix, basis_prefix))
                        process_common_prefix_nodes(
                            self_node, self_path,
                            basis_node, basis_path)
                        continue
                    # Exactly one side is a node: expand it before we can
                    # continue.
                    if read_self:
                        prefix, key, node, path = heapq.heappop(self_pending)
                        if check_excluded(path):
                            continue
                        process_node(node, path, self, self_pending)
                    if read_basis:
                        prefix, key, node, path = heapq.heappop(basis_pending)
                        if check_excluded(path):
                            continue
                        process_node(node, path, basis, basis_pending)

    def iteritems(self, key_filter=None):
        """Iterate over the entire CHKMap's contents.

        :param key_filter: Passed through to the root node's iteritems().
        """
        self._ensure_root()
        return self._root_node.iteritems(self._store, key_filter=key_filter)

    def key(self):
        """Return the key for this map."""
        if type(self._root_node) is tuple:
            return self._root_node
        else:
            return self._root_node._key

    def __len__(self):
        """Return the length reported by the (loaded) root node."""
        self._ensure_root()
        return len(self._root_node)

    def map(self, key, value):
        """Map a key tuple to value."""
        # Need a root object.
        self._ensure_root()
        prefix, node_details = self._root_node.map(self._store, key, value)
        if len(node_details) == 1:
            # No split at the top level: the single returned node is the root.
            self._root_node = node_details[0][1]
        else:
            # The old root split; build a new InternalNode above the pieces,
            # inheriting size and key-width settings from the first piece.
            self._root_node = InternalNode(prefix,
                                search_key_func=self._search_key_func)
            self._root_node.set_maximum_size(node_details[0][1].maximum_size)
            self._root_node._key_width = node_details[0][1]._key_width
            for split, node in node_details:
                self._root_node.add_node(split, node)

    def _node_key(self, node):
        """Get the key for a node whether it's a tuple or node."""
        if type(node) is tuple:
            return node
        else:
            return node._key

    def unmap(self, key, check_remap=True):
        """remove key from the map.

        :param key: The key to remove.
        :param check_remap: Passed to the root node's unmap() when the root
            is an InternalNode; ignored for any other root.
        """
        self._ensure_root()
        if type(self._root_node) is InternalNode:
            unmapped = self._root_node.unmap(self._store, key,
                check_remap=check_remap)
        else:
            unmapped = self._root_node.unmap(self._store, key)
        self._root_node = unmapped

    def _check_remap(self):
        """Check if nodes can be collapsed."""
        self._ensure_root()
        if type(self._root_node) is InternalNode:
            self._root_node._check_remap(self._store)

    def _save(self):
        """Save the map completely.

        :return: The key of the root node.
        """
        if type(self._root_node) is tuple:
            # Already saved.
            return self._root_node
        # The last key produced by serialise() is taken as the root's key.
        keys = list(self._root_node.serialise(self._store))
        return keys[-1]

497

498

499

class Node(object):
    """Base class defining the protocol for CHK Map nodes.

    :ivar _raw_size: The total size of the serialized key:value data, before
        adding the header bytes, and without prefix compression.
    """

    def __init__(self, key_width=1):
        """Create a node.

        :param key_width: The width of keys for this node.
        """
        self._key = None
        # Current number of elements
        self._len = 0
        self._maximum_size = 0
        self._key_width = key_width
        # current size in bytes
        self._raw_size = 0
        # The pointers/values this node has - meaning defined by child classes.
        self._items = {}
        # The common search prefix
        self._search_prefix = None

    def __repr__(self):
        # Keep the representation short: show at most the first 16
        # characters of the sorted item keys.
        items_str = str(sorted(self._items))
        if len(items_str) > 20:
            items_str = items_str[:16] + '...]'
        return '%s(key:%s len:%s size:%s max:%s prefix:%s items:%s)' % (
            self.__class__.__name__, self._key, self._len, self._raw_size,
            self._maximum_size, self._search_prefix, items_str)

    def key(self):
        """Return this node's key (None until one has been assigned)."""
        return self._key

    def __len__(self):
        return self._len

    @property
    def maximum_size(self):
        """What is the upper limit for adding references to a node."""
        return self._maximum_size

    def set_maximum_size(self, new_size):
        """Set the size threshold for nodes.

        :param new_size: The size at which no data is added to a node. 0 for
            unlimited.
        """
        self._maximum_size = new_size

    @classmethod
    def common_prefix(cls, prefix, key):
        """Given 2 strings, return the longest prefix common to both.

        :param prefix: This has been the common prefix for other keys, so it is
            more likely to be the common prefix in this case as well.
        :param key: Another string to compare to
        :return: The longest leading substring shared by prefix and key
            (possibly the empty string).
        """
        if key.startswith(prefix):
            return prefix
        # pos is the index of the last matching character. Start at -1 so
        # that an empty key (for which the loop body never runs and never
        # binds pos) yields an empty common prefix instead of raising
        # UnboundLocalError.
        pos = -1
        # Is there a better way to do this?
        for pos, (left, right) in enumerate(zip(prefix, key)):
            if left != right:
                pos -= 1
                break
        common = prefix[:pos+1]
        return common

    @classmethod
    def common_prefix_for_keys(cls, keys):
        """Given a list of keys, find their common prefix.

        :param keys: An iterable of strings.
        :return: The longest common prefix of all keys, or None if keys is
            empty.
        """
        common_prefix = None
        for key in keys:
            if common_prefix is None:
                common_prefix = key
                continue
            common_prefix = cls.common_prefix(common_prefix, key)
            if not common_prefix:
                # if common_prefix is the empty string, then we know it won't
                # change further
                return ''
        return common_prefix

586

587

588

# Singleton indicating we have not computed _search_prefix yet
_unknown = object()

590

591

class LeafNode(Node):

592

"""A node containing actual key:value pairs.

593

594

:ivar _items: A dict of key->value items. The key is in tuple form.

595

:ivar _size: The number of bytes that would be used by serializing all of

596

the key/value pairs.

597

"""

598

599

def __init__(self, search_key_func=None):

600

Node.__init__(self)

601

# All of the keys in this leaf node share this common prefix

602

self._common_serialised_prefix = None

603

self._serialise_key = '\x00'.join

604

if search_key_func is None:

605

self._search_key_func = _search_key_plain

606

else:

607

self._search_key_func = search_key_func

608

609

def __repr__(self):

610

items_str = str(sorted(self._items))

611

if len(items_str) > 20:

612

items_str = items_str[:16] + '...]'

613

return \

614

'%s(key:%s len:%s size:%s max:%s prefix:%s keywidth:%s items:%s)' \

615

% (self.__class__.__name__, self._key, self._len, self._raw_size,

616

self._maximum_size, self._search_prefix, self._key_width, items_str)

617

618

def _current_size(self):

619

"""Answer the current serialised size of this node.

620

621

This differs from self._raw_size in that it includes the bytes used for

622

the header.

623

"""

624

if self._common_serialised_prefix is None:

625

bytes_for_items = 0

626

prefix_len = 0

627

else:

628

# We will store a single string with the common prefix

629

# And then that common prefix will not be stored in any of the

630

# entry lines

631

prefix_len = len(self._common_serialised_prefix)

632

bytes_for_items = (self._raw_size - (prefix_len * self._len))

633

return (9 # 'chkleaf:\n'

634

+ len(str(self._maximum_size)) + 1

635

+ len(str(self._key_width)) + 1

636

+ len(str(self._len)) + 1

637

+ prefix_len + 1

638

+ bytes_for_items)

639

640

@classmethod

641

def deserialise(klass, bytes, key, search_key_func=None):

642

"""Deserialise bytes, with key key, into a LeafNode.

643

644

:param bytes: The bytes of the node.

645

:param key: The key that the serialised node has.

646

"""

647

return _deserialise_leaf_node(bytes, key,

648

search_key_func=search_key_func)

649

650

def iteritems(self, store, key_filter=None):

651

"""Iterate over items in the node.

652

653

:param key_filter: A filter to apply to the node. It should be a

654

list/set/dict or similar repeatedly iterable container.

655

"""

656

if key_filter is not None:

657

# Adjust the filter - short elements go to a prefix filter. All

658

# other items are looked up directly.

659

# XXX: perhaps defaultdict? Profiling<rinse and repeat>

660

filters = {}

661

for key in key_filter:

662

if len(key) == self._key_width:

663

# This filter is meant to match exactly one key, yield it

664

# if we have it.

665

try:

666

yield key, self._items[key]

667

except KeyError:

668

# This key is not present in this map, continue

669

pass

670

else:

671

# Short items, we need to match based on a prefix

672

length_filter = filters.setdefault(len(key), set())

673

length_filter.add(key)

674

if filters:

675

filters = filters.items()

676

for item in self._items.iteritems():

677

for length, length_filter in filters:

678

if item[0][:length] in length_filter:

679

yield item

680

break

681

else:

682

for item in self._items.iteritems():

683

yield item

684

685

def _key_value_len(self, key, value):

686

# TODO: Should probably be done without actually joining the key, but

687

# then that can be done via the C extension

688

return (len(self._serialise_key(key)) + 1

689

+ len(str(value.count('\n'))) + 1

690

+ len(value) + 1)

691

692

def _search_key(self, key):

693

return self._search_key_func(key)

694

695

def _map_no_split(self, key, value):

696

"""Map a key to a value.

697

698

This assumes either the key does not already exist, or you have already

699

removed its size and length from self.

700

701

:return: True if adding this node should cause us to split.

702

"""

703

self._items[key] = value

704

self._raw_size += self._key_value_len(key, value)

705

self._len += 1

706

serialised_key = self._serialise_key(key)

707

if self._common_serialised_prefix is None:

708

self._common_serialised_prefix = serialised_key

709

else:

710

self._common_serialised_prefix = self.common_prefix(

711

self._common_serialised_prefix, serialised_key)

712

search_key = self._search_key(key)

713

if self._search_prefix is _unknown:

714

self._compute_search_prefix()

715

if self._search_prefix is None:

716

self._search_prefix = search_key

717

else:

718

self._search_prefix = self.common_prefix(

719

self._search_prefix, search_key)

720

if (self._len > 1

721

and self._maximum_size

722

and self._current_size() > self._maximum_size):

723

# Check to see if all of the search_keys for this node are

724

# identical. We allow the node to grow under that circumstance

725

# (we could track this as common state, but it is infrequent)

726

if (search_key != self._search_prefix

727

or not self._are_search_keys_identical()):

728

return True

729

return False

730

731

def _split(self, store):

732

"""We have overflowed.

733

734

Split this node into multiple LeafNodes, return it up the stack so that

735

the next layer creates a new InternalNode and references the new nodes.

736

737

:return: (common_serialised_prefix, [(node_serialised_prefix, node)])

738

"""

739

assert self._search_prefix is not _unknown

740

common_prefix = self._search_prefix

741

split_at = len(common_prefix) + 1

742

result = {}

743

for key, value in self._items.iteritems():

744

search_key = self._search_key(key)

745

prefix = search_key[:split_at]

746

# TODO: Generally only 1 key can be exactly the right length,

747

# which means we can only have 1 key in the node pointed

748

# at by the 'prefix\0' key. We might want to consider

749

# folding it into the containing InternalNode rather than

750

# having a fixed length-1 node.

751

# Note this is probably not true for hash keys, as they

752

# may get a '\00' node anywhere, but won't have keys of

753

# different lengths.

754

if len(prefix) < split_at:

755

prefix += '\x00'*(split_at - len(prefix))

756

if prefix not in result:

757

node = LeafNode(search_key_func=self._search_key_func)

758

node.set_maximum_size(self._maximum_size)

759

node._key_width = self._key_width

760

result[prefix] = node

761

else:

762

node = result[prefix]

763

node.map(store, key, value)

764

return common_prefix, result.items()

765

766

def map(self, store, key, value):

767

"""Map key to value."""

768

if key in self._items:

769

self._raw_size -= self._key_value_len(key, self._items[key])

770

self._len -= 1

771

self._key = None

772

if self._map_no_split(key, value):

773

return self._split(store)

774

else:

775

assert self._search_prefix is not _unknown

776

return self._search_prefix, [("", self)]

777

778

def serialise(self, store):

779

"""Serialise the LeafNode to store.

780

781

:param store: A VersionedFiles honouring the CHK extensions.

782

:return: An iterable of the keys inserted by this operation.

783

"""

784

lines = ["chkleaf:\n"]

785

lines.append("%d\n" % self._maximum_size)

786

lines.append("%d\n" % self._key_width)

787

lines.append("%d\n" % self._len)

788

if self._common_serialised_prefix is None:

789

lines.append('\n')

790

if len(self._items) != 0:

791

raise AssertionError('If _common_serialised_prefix is None'

792

' we should have no items')

793

else:

794

lines.append('%s\n' % (self._common_serialised_prefix,))

795

prefix_len = len(self._common_serialised_prefix)

796

for key, value in sorted(self._items.items()):

797

# Always add a final newline

798

value_lines = osutils.chunks_to_lines([value + '\n'])

799

serialized = "%s\x00%s\n" % (self._serialise_key(key),

800

len(value_lines))

801

if not serialized.startswith(self._common_serialised_prefix):

802

raise AssertionError('We thought the common prefix was %r'

803

' but entry %r does not have it in common'

804

% (self._common_serialised_prefix, serialized))

805

lines.append(serialized[prefix_len:])

806

lines.extend(value_lines)

807

sha1, _, _ = store.add_lines((None,), (), lines)

808

self._key = ("sha1:" + sha1,)

809

bytes = ''.join(lines)

810

if len(bytes) != self._current_size():

811

raise AssertionError('Invalid _current_size')

812

_page_cache.add(self._key, bytes)

813

return [self._key]

814

815

def refs(self):

816

"""Return the references to other CHK's held by this node."""

817

return []

818

819

def _compute_search_prefix(self):

820

"""Determine the common search prefix for all keys in this node.

821

822

:return: A bytestring of the longest search key prefix that is

823

unique within this node.

824

"""

825

search_keys = [self._search_key_func(key) for key in self._items]

826

self._search_prefix = self.common_prefix_for_keys(search_keys)

827

return self._search_prefix

828

829

def _are_search_keys_identical(self):

830

"""Check to see if the search keys for all entries are the same.

831

832

When using a hash as the search_key it is possible for non-identical

833

keys to collide. If that happens enough, we may try overflow a

834

LeafNode, but as all are collisions, we must not split.

835

"""

836

common_search_key = None

837

for key in self._items:

838

search_key = self._search_key(key)

839

if common_search_key is None:

840

common_search_key = search_key

841

elif search_key != common_search_key:

842

return False

843

return True

844

845

def _compute_serialised_prefix(self):

846

"""Determine the common prefix for serialised keys in this node.

847

848

:return: A bytestring of the longest serialised key prefix that is

849

unique within this node.

850

"""

851

serialised_keys = [self._serialise_key(key) for key in self._items]

852

self._common_serialised_prefix = self.common_prefix_for_keys(

853

serialised_keys)

854

return self._common_serialised_prefix

855

856

def unmap(self, store, key):
    """Remove key from this node and refresh the cached prefixes.

    :param store: Not used by this implementation.
    :param key: The key to remove; it must be present in self._items.
    :return: self.
    :raises KeyError: If key is not in self._items. The miss is reported
        through trace.mutter before the error is re-raised.
    """
    try:
        entry_size = self._key_value_len(key, self._items[key])
        self._raw_size -= entry_size
    except KeyError:
        trace.mutter("key %s not found in %r", key, self._items)
        raise
    self._len -= 1
    del self._items[key]
    # The content changed, so forget the key assigned when it was serialised.
    self._key = None
    # Both prefixes are recomputed from scratch from the remaining items.
    self._compute_search_prefix()
    self._compute_serialised_prefix()
    return self

870

871

872

class InternalNode(Node):

873

"""A node that contains references to other nodes.

874

875

An InternalNode is responsible for mapping search key prefixes to child

876

nodes.

877

878

:ivar _items: serialised_key => node dictionary. node may be a tuple,

879

LeafNode or InternalNode.

880

"""

881

882

def __init__(self, prefix='', search_key_func=None):
    """Create an InternalNode with no children.

    :param prefix: Initial value for self._search_prefix.
    :param search_key_func: Callable used to turn a key into its search
        key. When None, _search_key_plain is used.
    """
    Node.__init__(self)
    # How many octets key prefixes within this node are.
    self._node_width = 0
    self._search_prefix = prefix
    if search_key_func is not None:
        self._search_key_func = search_key_func
    else:
        self._search_key_func = _search_key_plain

892

893

def add_node(self, prefix, node):
    """Register node as the child reached through prefix.

    :param prefix: The search key prefix for node. It must start with
        self._search_prefix and be exactly one character longer than it.
    :param node: The node being added; len(node) is added to self._len.
    :raises AssertionError: If self._search_prefix is None, if prefix does
        not extend self._search_prefix by exactly one character, or if the
        node width does not match.
    """
    own_prefix = self._search_prefix
    if own_prefix is None:
        raise AssertionError("_search_prefix should not be None")
    if not prefix.startswith(own_prefix):
        raise AssertionError("prefixes mismatch: %s must start with %s"
            % (prefix, own_prefix))
    expected_width = len(own_prefix) + 1
    if len(prefix) != expected_width:
        raise AssertionError("prefix wrong length: len(%s) is not %d" %
            (prefix, expected_width))
    self._len += len(node)
    if len(self._items) == 0:
        # The first child fixes the width of this node.
        self._node_width = len(prefix)
    if self._node_width != expected_width:
        raise AssertionError("node width mismatch: %d is not %d" %
            (self._node_width, expected_width))
    self._items[prefix] = node
    # The node has been modified, so it no longer has a serialised key.
    self._key = None

915

916

def _current_size(self):

917

"""Answer the current serialised size of this node."""

918

return (self._raw_size + len(str(self._len)) + len(str(self._key_width)) +

919

len(str(self._maximum_size)))

920

921

@classmethod
def deserialise(klass, bytes, key, search_key_func=None):
    """Build an InternalNode from its serialised bytes.

    The work is delegated to _deserialise_internal_node.

    :param bytes: The bytes of the node.
    :param key: The key that the serialised node has.
    :param search_key_func: Passed through unchanged to
        _deserialise_internal_node.
    :return: An InternalNode instance.
    """
    node = _deserialise_internal_node(bytes, key,
        search_key_func=search_key_func)
    return node

931

932

def iteritems(self, store, key_filter=None):
    """Yield the items of every child node selected by key_filter.

    :param store: Passed to self._iter_nodes and to each child's iteritems.
    :param key_filter: Passed to self._iter_nodes; each child is then asked
        only for the filter that _iter_nodes paired it with.
    :return: A generator of whatever the children's iteritems produce.
    """
    selected = self._iter_nodes(store, key_filter=key_filter)
    for child, child_filter in selected:
        for entry in child.iteritems(store, key_filter=child_filter):
            yield entry

936

937

def _iter_nodes(self, store, key_filter=None, batch_size=None):
    """Iterate over child nodes which match key_filter.

    Children already held as node objects are yielded first. Children
    still held as key tuples in self._items are then deserialised, first
    from _page_cache and then from store, and each resulting node replaces
    its tuple in self._items.

    :param store: A store to use for accessing content.
    :param key_filter: A key filter to filter nodes. Only nodes that might
        contain a key in key_filter will be returned. If None, every child
        is returned.
    :param batch_size: If not None, then we will return the nodes that had
        to be read using get_record_stream in batches, rather than reading
        them all at once.
    :return: An iterable of (node, node_key_filter) tuples.
        node_key_filter is None when key_filter is None, otherwise it is
        the list of key_filter entries that matched the child's prefix.
        This function does not have to be fully consumed. (There will be
        no pending I/O when items are being returned.)
    """
    # Map from chk key ('sha1:...',) to (prefix, key_filter)
    # prefix is the key in self._items to use, key_filter is the key_filter
    # entries that would match this node
    keys = {}
    if key_filter is None:
        for prefix, node in self._items.iteritems():
            if type(node) == tuple:
                # Not loaded yet: remember it for the lookup phases below.
                keys[node] = (prefix, None)
            else:
                yield node, None
    else:
        # XXX defaultdict ?
        # prefix_to_keys: search-key prefix => the filter keys producing it.
        # length_filters: prefix length => set of prefixes of that length.
        prefix_to_keys = {}
        length_filters = {}
        for key in key_filter:
            search_key = self._search_prefix_filter(key)
            length_filter = length_filters.setdefault(
                len(search_key), set())
            length_filter.add(search_key)
            prefix_to_keys.setdefault(search_key, []).append(key)
        length_filters = length_filters.items()
        for prefix, node in self._items.iteritems():
            node_key_filter = []
            for length, length_filter in length_filters:
                # Compare the child's prefix, cut to each filter length.
                sub_prefix = prefix[:length]
                if sub_prefix in length_filter:
                    node_key_filter.extend(prefix_to_keys[sub_prefix])
            if node_key_filter: # this key matched something, yield it
                if type(node) == tuple:
                    keys[node] = (prefix, node_key_filter)
                else:
                    yield node, node_key_filter
    if keys:
        # Look in the page cache for some more bytes
        found_keys = set()
        for key in keys:
            try:
                bytes = _page_cache[key]
            except KeyError:
                continue
            else:
                node = _deserialise(bytes, key,
                    search_key_func=self._search_key_func)
                prefix, node_key_filter = keys[key]
                self._items[prefix] = node
                found_keys.add(key)
                yield node, node_key_filter
        # Removal is deferred so that keys is not changed while iterating it.
        for key in found_keys:
            del keys[key]
    if keys:
        # demand load some pages.
        if batch_size is None:
            # Read all the keys in
            batch_size = len(keys)
        key_order = list(keys)
        for batch_start in range(0, len(key_order), batch_size):
            batch = key_order[batch_start:batch_start + batch_size]
            # We have to fully consume the stream so there is no pending
            # I/O, so we buffer the nodes for now.
            stream = store.get_record_stream(batch, 'unordered', True)
            node_and_filters = []
            for record in stream:
                bytes = record.get_bytes_as('fulltext')
                node = _deserialise(bytes, record.key,
                    search_key_func=self._search_key_func)
                prefix, node_key_filter = keys[record.key]
                node_and_filters.append((node, node_key_filter))
                self._items[prefix] = node
                _page_cache.add(record.key, bytes)
            for info in node_and_filters:
                yield info

1020

1021

def map(self, store, key, value):
    """Map key to value.

    :param store: The store handed to _iter_nodes, to the child's map and
        to _check_remap.
    :param key: The key to map.
    :param value: The value to associate with key.
    :return: A tuple (search_prefix, node_details). node_details is a
        one-element list [('', node)] where node is self, the result of
        self._check_remap(store), or (when key does not share this node's
        search prefix) whatever the newly created parent's map returns.
    :raises AssertionError: If this node has no children, if the node
        width is inconsistent with the search prefix, or if the resulting
        node has a _search_prefix of None.
    """
    if not len(self._items):
        raise AssertionError("can't map in an empty InternalNode.")
    search_key = self._search_key(key)
    if self._node_width != len(self._search_prefix) + 1:
        raise AssertionError("node width mismatch: %d is not %d" %
            (self._node_width, len(self._search_prefix) + 1))
    if not search_key.startswith(self._search_prefix):
        # This key doesn't fit in this index, so we need to split at the
        # point where it would fit, insert self into that internal node,
        # and then map this key into that node.
        new_prefix = self.common_prefix(self._search_prefix,
            search_key)
        new_parent = InternalNode(new_prefix,
            search_key_func=self._search_key_func)
        new_parent.set_maximum_size(self._maximum_size)
        new_parent._key_width = self._key_width
        new_parent.add_node(self._search_prefix[:len(new_prefix)+1],
            self)
        return new_parent.map(store, key, value)
    children = [node for node, _
        in self._iter_nodes(store, key_filter=[key])]
    if children:
        child = children[0]
    else:
        # new child needed:
        child = self._new_child(search_key, LeafNode)
    # Remember the child's length (and size, for leaves) before mapping so
    # the change caused by this call can be measured afterwards.
    old_len = len(child)
    if type(child) is LeafNode:
        old_size = child._current_size()
    else:
        old_size = None
    prefix, node_details = child.map(store, key, value)
    if len(node_details) == 1:
        # child may have shrunk, or might be a new node
        child = node_details[0][1]
        self._len = self._len - old_len + len(child)
        self._items[search_key] = child
        self._key = None
        new_node = self
        if type(child) is LeafNode:
            if old_size is None:
                # The old node was an InternalNode which means it has now
                # collapsed, so we need to check if it will chain to a
                # collapse at this level.
                trace.mutter("checking remap as InternalNode -> LeafNode")
                new_node = self._check_remap(store)
            else:
                # If the LeafNode has shrunk in size, we may want to run
                # a remap check. Checking for a remap is expensive though
                # and the frequency of a successful remap is very low.
                # Shrinkage by small amounts is common, so we only do the
                # remap check if the new_size is low or the shrinkage
                # amount is over a configurable limit.
                new_size = child._current_size()
                shrinkage = old_size - new_size
                if (shrinkage > 0 and new_size < _INTERESTING_NEW_SIZE
                    or shrinkage > _INTERESTING_SHRINKAGE_LIMIT):
                    trace.mutter(
                        "checking remap as size shrunk by %d to be %d",
                        shrinkage, new_size)
                    new_node = self._check_remap(store)
        if new_node._search_prefix is None:
            raise AssertionError("_search_prefix should not be None")
        return new_node._search_prefix, [('', new_node)]
    # child has overflown - create a new intermediate node.
    # XXX: This is where we might want to try and expand our depth
    # to refer to more bytes of every child (which would give us
    # multiple pointers to child nodes, but less intermediate nodes)
    child = self._new_child(search_key, InternalNode)
    child._search_prefix = prefix
    # Hang every piece the old child split into off the new intermediate
    # node.
    for split, node in node_details:
        child.add_node(split, node)
    self._len = self._len - old_len + len(child)
    self._key = None
    return self._search_prefix, [("", self)]

1098

1099

def _new_child(self, search_key, klass):

1100

"""Create a new child node of type klass."""

1101

child = klass()

1102

child.set_maximum_size(self._maximum_size)

1103

child._key_width = self._key_width

1104

child._search_key_func = self._search_key_func

1105

self._items[search_key] = child

1106

return child

1107

1108

def serialise(self, store):
    """Serialise the node to store.

    Children that have been loaded and have no key are serialised first,
    then this node itself is written and added to _page_cache.

    :param store: A VersionedFiles honouring the CHK extensions.
    :return: An iterable of the keys inserted by this operation; this
        node's own key is yielded last.
    :raises AssertionError: If self._search_prefix is None, or if a
        serialised child line does not start with self._search_prefix.
    """
    for child in self._items.itervalues():
        # A tuple was never deserialised, and a node that still has a key
        # was never altered; neither needs writing again.
        if type(child) == tuple or child._key is not None:
            continue
        for inserted_key in child.serialise(store):
            yield inserted_key
    lines = [
        "chknode:\n",
        "%d\n" % self._maximum_size,
        "%d\n" % self._key_width,
        "%d\n" % self._len,
        ]
    search_prefix = self._search_prefix
    if search_prefix is None:
        raise AssertionError("_search_prefix should not be None")
    lines.append('%s\n' % (search_prefix,))
    prefix_len = len(search_prefix)
    for prefix, node in sorted(self._items.items()):
        if type(node) == tuple:
            key = node[0]
        else:
            key = node._key[0]
        serialised = "%s\x00%s\n" % (prefix, key)
        if not serialised.startswith(search_prefix):
            raise AssertionError("prefixes mismatch: %s must start with %s"
                % (serialised, search_prefix))
        # The shared prefix was written once above, so strip it here.
        lines.append(serialised[prefix_len:])
    sha1, _, _ = store.add_lines((None,), (), lines)
    self._key = ("sha1:" + sha1,)
    _page_cache.add(self._key, ''.join(lines))
    yield self._key

1145

1146

def _search_key(self, key):

1147

"""Return the serialised key for key in this node."""

1148

# search keys are fixed width. All will be self._node_width wide, so we

1149

# pad as necessary.

1150

return (self._search_key_func(key) + '\x00'*self._node_width)[:self._node_width]

1151

1152

def _search_prefix_filter(self, key):

1153

"""Serialise key for use as a prefix filter in iteritems."""

1154

return self._search_key_func(key)[:self._node_width]

1155

1156

def _split(self, offset):

1157

"""Split this node into smaller nodes starting at offset.

1158

1159

:param offset: The offset to start the new child nodes at.

1160

:return: An iterable of (prefix, node) tuples. prefix is a byte

1161

prefix for reaching node.

1162

"""

1163

if offset >= self._node_width:

1164

for node in self._items.values():

1165

for result in node._split(offset):

1166

yield result

1167

return

1168

for key, node in self._items.items():

1169

pass

1170

1171

def refs(self):
    """Return the references to other CHK's held by this node.

    :return: A list with one entry per child: the stored tuple itself for
        children held as tuples, otherwise the result of the child's key()
        method.
    :raises AssertionError: If this node has no key (self._key is None).
    """
    if self._key is None:
        raise AssertionError("unserialised nodes have no refs.")
    collected = []
    for child in self._items.itervalues():
        if type(child) != tuple:
            # A loaded node: ask it for its key.
            child = child.key()
        collected.append(child)
    return collected

1182

1183

def _compute_search_prefix(self, extra_key=None):

1184

"""Return the unique key prefix for this node.

1185

1186

:return: A bytestring of the longest search key prefix that is

1187

unique within this node.

1188

"""

1189

self._search_prefix = self.common_prefix_for_keys(self._items)

1190

return self._search_prefix

1191

1192

def unmap(self, store, key, check_remap=True):
    """Remove key from this node and its children.

    :param store: The store handed to _iter_nodes, to the child's unmap
        and to _check_remap.
    :param key: The key to remove.
    :param check_remap: When True (the default), finish by calling
        self._check_remap(store) unless an earlier return applies.
    :return: The node that should take this node's place: the sole
        remaining child when only one is left, otherwise self or the
        result of self._check_remap(store).
    :raises AssertionError: If this node has no children.
    :raises KeyError: If no child matches key, or if the child's unmap
        raises it. In both cases this node's _len and _key are left
        untouched.
    """
    if not len(self._items):
        raise AssertionError("can't unmap in an empty InternalNode.")
    children = [node for node, _
        in self._iter_nodes(store, key_filter=[key])]
    if children:
        child = children[0]
    else:
        raise KeyError(key)
    unmapped = child.unmap(store, key)
    # Only adjust our own bookkeeping once the child has really removed the
    # key; the child raises KeyError when the key is absent, and
    # decrementing first would leave self._len one too small after such a
    # failure.
    self._len -= 1
    self._key = None
    search_key = self._search_key(key)
    if len(unmapped) == 0:
        # All child nodes are gone, remove the child:
        del self._items[search_key]
        unmapped = None
    else:
        # Stash the returned node
        self._items[search_key] = unmapped
    if len(self._items) == 1:
        # this node is no longer needed:
        return self._items.values()[0]
    if type(unmapped) is InternalNode:
        # The returned child is still an InternalNode, so no remap check
        # is made at this level.
        return self
    if check_remap:
        return self._check_remap(store)
    else:
        return self

1222

1223

def _check_remap(self, store):
    """Check if all keys contained by children fit in a single LeafNode.

    :param store: A store to use for reading more nodes
    :return: Either self, or a new LeafNode which should replace self.
    """
    # When is a rebuild worth considering?
    # 1) unmap() removes a key, so implicitly the child nodes shrink to
    #    some extent.
    # 2) If all children are LeafNodes they might be combinable into one
    #    LeafNode, which can then completely replace this internal node.
    # 3) If *one* child is an InternalNode, we assume it has already done
    #    all the work to determine that its children cannot collapse, and
    #    can then assume that those nodes *plus* the current nodes don't
    #    have a chance of collapsing either. So a very cheap check is: if
    #    'unmapped' is an InternalNode, we don't have to look further.

    # TODO: Another alternative is to check the total size of all known
    #       LeafNodes. A formula giving the final size without having to
    #       read in any more children would be nice to have. However, care
    #       is needed with nodes that pull out the common prefix of each
    #       key: adding a new key can change the common prefix and cause
    #       size changes greater than the length of one key. So for now
    #       everything is added to a new Leaf until it splits, as we know
    #       that gives the right answer.
    candidate = LeafNode(search_key_func=self._search_key_func)
    candidate.set_maximum_size(self._maximum_size)
    candidate._key_width = self._key_width
    # A batch_size of 16 was chosen because:
    #   a) In testing, a 4k page held 14 times. So if we have more than 16
    #      leaf nodes we are unlikely to hold them in a single new leaf
    #      node. This still allows for 1 round trip
    #   b) With 16-way fan out, we can still do a single round trip
    #   c) With 255-way fan out, we don't want to read all 255 and destroy
    #      the page cache, just to determine that we really don't need it.
    for child, _ in self._iter_nodes(store, batch_size=16):
        if type(child) is InternalNode:
            # Without looking at any leaf nodes, we are sure
            return self
        for item_key, item_value in child._items.iteritems():
            # A true result from _map_no_split means the candidate cannot
            # take this item, so this node has to stay as it is.
            if candidate._map_no_split(item_key, item_value):
                return self
    trace.mutter("remap generated a new LeafNode")
    return candidate

1270

1271

1272

def _deserialise(bytes, key, search_key_func):

1273

"""Helper for repositorydetails - convert bytes to a node."""

1274

if bytes.startswith("chkleaf:\n"):

1275

node = LeafNode.deserialise(bytes, key, search_key_func=search_key_func)

1276

elif bytes.startswith("chknode:\n"):

1277

node = InternalNode.deserialise(bytes, key,

1278

search_key_func=search_key_func)

1279

else:

1280

raise AssertionError("Unknown node type.")

1281

return node

1282

1283

1284

def _find_children_info(store, interesting_keys, uninteresting_keys, pb):
    """Read the associated records, and determine what is interesting.

    Every key in either argument is read from store in a single request.
    A record whose key is in uninteresting_keys feeds the uninteresting
    results; every other record feeds the interesting results.

    :param store: Object providing get_record_stream.
    :param interesting_keys: Keys whose content is wanted.
    :param uninteresting_keys: Keys whose content should be filtered out.
    :param pb: Optional progress bar; pb.tick() is called per record.
    :return: A tuple (next_uninteresting, uninteresting_items,
        next_interesting, interesting_records, interesting_items). The
        next_* sets hold the refs() of InternalNodes, the *_items sets
        hold the items of the other nodes, and interesting_records lists
        the interesting records themselves.
    """
    uninteresting_keys = set(uninteresting_keys)
    chks_to_read = uninteresting_keys.union(interesting_keys)
    next_uninteresting = set()
    next_interesting = set()
    uninteresting_items = set()
    interesting_items = set()
    interesting_records = []
    for record in store.get_record_stream(chks_to_read, 'unordered', True):
        if pb is not None:
            pb.tick()
        bytes = record.get_bytes_as('fulltext')
        # We don't care about search_key_func for this code, because we only
        # care about external references.
        node = _deserialise(bytes, record.key, search_key_func=None)
        # Pick the pair of result sets this record contributes to.
        if record.key in uninteresting_keys:
            ref_sink = next_uninteresting
            item_sink = uninteresting_items
        else:
            interesting_records.append(record)
            ref_sink = next_interesting
            item_sink = interesting_items
        if type(node) is InternalNode:
            ref_sink.update(node.refs())
        else:
            # We know we are at a LeafNode, so we can pass None for the
            # store
            item_sink.update(node.iteritems(None))
    # TODO: Filter out records that have already been read, as node splitting
    #       can cause us to reference the same nodes via shorter and longer
    #       paths
    return (next_uninteresting, uninteresting_items,
            next_interesting, interesting_records, interesting_items)

1320

1321

1322

def _find_all_uninteresting(store, interesting_root_keys,

1323

uninteresting_root_keys, adapter, pb):

1324

"""Determine the full set of uninteresting keys."""

1325

# What about duplicates between interesting_root_keys and

1326

# uninteresting_root_keys?

1327

if not uninteresting_root_keys:

1328

# Shortcut case. We know there is nothing uninteresting to filter out

1329

# So we just let the rest of the algorithm do the work

1330

# We know there is nothing uninteresting, and we didn't have to read

1331

# any interesting records yet.

1332

return (set(), set(), set(interesting_root_keys), [], set())

1333

all_uninteresting_chks = set(uninteresting_root_keys)

1334

all_uninteresting_items = set()

1335

1336

# First step, find the direct children of both the interesting and

1337

# uninteresting set

1338

(uninteresting_keys, uninteresting_items,

1339

interesting_keys, interesting_records,

1340

interesting_items) = _find_children_info(store, interesting_root_keys,

1341

uninteresting_root_keys,

1342

pb=pb)

1343

all_uninteresting_chks.update(uninteresting_keys)

1344

all_uninteresting_items.update(uninteresting_items)

1345

del uninteresting_items

1346

# Note: Exact matches between interesting and uninteresting do not need

1347

# to be search further. Non-exact matches need to be searched in case

1348

# there is a future exact-match

1349

uninteresting_keys.difference_update(interesting_keys)

1350

1351

# Second, find the full set of uninteresting bits reachable by the

1352

# uninteresting roots

1353

chks_to_read = uninteresting_keys

1354

while chks_to_read:

1355

next_chks = set()

1356

for record in store.get_record_stream(chks_to_read, 'unordered', False):

1357

# TODO: Handle 'absent'

1358

if pb is not None:

1359

pb.tick()

1360

try:

1361

bytes = record.get_bytes_as('fulltext')

1362

except errors.UnavailableRepresentation:

1363

bytes = adapter.get_bytes(record)

1364

# We don't care about search_key_func for this code, because we

1365

# only care about external references.

1366

node = _deserialise(bytes, record.key, search_key_func=None)

1367

if type(node) is InternalNode:

1368

# uninteresting_prefix_chks.update(node._items.iteritems())

1369

chks = node._items.values()

1370

# TODO: We remove the entries that are already in

1371

# uninteresting_chks ?

1372

next_chks.update(chks)

1373

all_uninteresting_chks.update(chks)

1374

else:

1375

all_uninteresting_items.update(node._items.iteritems())

1376

chks_to_read = next_chks

1377

return (all_uninteresting_chks, all_uninteresting_items,

1378

interesting_keys, interesting_records, interesting_items)

1379

1380

1381

def iter_interesting_nodes(store, interesting_root_keys,
                           uninteresting_root_keys, pb=None):
    """Given root keys, find interesting nodes.

    Evaluate nodes referenced by interesting_root_keys. Ones that are also
    referenced from uninteresting_root_keys are not considered interesting.

    :param store: Object providing get_record_stream; it is also handed to
        the 'knit-ft-gz' to 'fulltext' adapter.
    :param interesting_root_keys: keys which should be part of the
        "interesting" nodes (which will be yielded)
    :param uninteresting_root_keys: keys which should be filtered out of the
        result set.
    :param pb: Optional progress bar, updated as pages are read.
    :return: Yield
        (interesting records, interesting key:values)
        where the records are a dict mapping record.key to record. The
        same key:value may be yielded more than once, as items already
        seen on other pages are not filtered out.
    """
    # TODO: consider that it may be more memory efficient to use the 20-byte
    #       sha1 string, rather than tuples of hexadecimal sha1 strings.
    # TODO: Try to factor out a lot of the get_record_stream() calls into a
    #       helper function similar to _read_bytes. This function should be
    #       able to use nodes from the _page_cache as well as actually
    #       requesting bytes from the store.

    # A way to adapt from the compressed texts back into fulltexts
    # In a way, this seems like a layering inversion to have CHKMap know the
    # details of versionedfile
    adapter_class = versionedfile.adapter_registry.get(
        ('knit-ft-gz', 'fulltext'))
    adapter = adapter_class(store)

    (all_uninteresting_chks, all_uninteresting_items, interesting_keys,
     interesting_records, interesting_items) = _find_all_uninteresting(store,
        interesting_root_keys, uninteresting_root_keys, adapter, pb)

    # Now that we know everything uninteresting, we can yield information from
    # our first request
    interesting_items.difference_update(all_uninteresting_items)
    records = dict((record.key, record) for record in interesting_records
                    if record.key not in all_uninteresting_chks)
    if records or interesting_items:
        yield records, interesting_items
    interesting_keys.difference_update(all_uninteresting_chks)
    # TODO: We need a test for this
    #       This handles the case where after a split, one of the child trees
    #       is identical to one of the interesting root keys. Like if you had a
    #       leaf node, with "aa" "ab", that then overflowed at "bb". You would
    #       get a new internal node, but it would have one leaf node with
    #       ("aa", "ab") and another leaf node with "bb". And you don't want to
    #       re-transmit that ("aa", "ab") node again
    all_uninteresting_chks.update(interesting_root_keys)

    chks_to_read = interesting_keys
    # counter is the number of records read so far, reported to pb.
    counter = 0
    while chks_to_read:
        next_chks = set()
        for record in store.get_record_stream(chks_to_read, 'unordered', False):
            counter += 1
            if pb is not None:
                pb.update('find chk pages', counter)
            # TODO: Handle 'absent'?
            try:
                bytes = record.get_bytes_as('fulltext')
            except errors.UnavailableRepresentation:
                bytes = adapter.get_bytes(record)
            # We don't care about search_key_func for this code, because we
            # only care about external references.
            node = _deserialise(bytes, record.key, search_key_func=None)
            if type(node) is InternalNode:
                # all_uninteresting_chks grows large, as it lists all nodes we
                # don't want to process (including already seen interesting
                # nodes).
                # small.difference_update(large) scales O(large), but
                # small.difference(large) scales O(small).
                # Also, we know we just _deserialised this node, so we can
                # access the dict directly.
                chks = set(node._items.itervalues()).difference(
                            all_uninteresting_chks)
                # Is set() and .difference_update better than:
                # chks = [chk for chk in node.refs()
                #              if chk not in all_uninteresting_chks]
                next_chks.update(chks)
                # These are now uninteresting everywhere else
                all_uninteresting_chks.update(chks)
                # An InternalNode contributes no key:value items itself.
                interesting_items = []
            else:
                interesting_items = [item for item in node._items.iteritems()
                                     if item not in all_uninteresting_items]
                # TODO: Do we need to filter out items that we have already
                #       seen on other pages? We don't really want to buffer the
                #       whole thing, but it does mean that callers need to
                #       understand they may get duplicate values.
                # all_uninteresting_items.update(interesting_items)
            yield {record.key: record}, interesting_items
        chks_to_read = next_chks

1473

1474

1475

try:

1476

from bzrlib._chk_map_pyx import (

1477

_search_key_16,

1478

_search_key_255,

1479

_deserialise_leaf_node,

1480

_deserialise_internal_node,

1481

)

1482

except ImportError:

1483

from bzrlib._chk_map_py import (

1484

_search_key_16,

1485

_search_key_255,

1486

_deserialise_leaf_node,

1487

_deserialise_internal_node,

1488

)

1489

search_key_registry.register('hash-16-way', _search_key_16)

1490

search_key_registry.register('hash-255-way', _search_key_255)