~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/btree_index.py

Committer: John Arbash Meinel
Date: 2008-10-04 14:10:13 UTC
mto: This revision was merged to the branch mainline in revision 3805.
Revision ID: john@arbash-meinel.com-20081004141013-yskxjlwtuy2k18ue

Playing around with expanding requests for btree index nodes into neighboring nodes.

files modified:
bzrlib/btree_index.py

Show diffs side-by-side

added added

removed removed

bzrlib/btree_index.py

631

def _get_root_node(self):

632

if self._root_node is None:

633

# We may not have a root node yet

634

nodes = list(self._read_nodes([0]))

635

if len(nodes):

636

self._root_node = nodes[0][1]

634

self._get_internal_nodes([0])

637

635

return self._root_node

638

636

639

637

def _cache_nodes(self, nodes, cache):

653

651

trace.mutter('Requesting %s > %s nodes, not all will be cached',

654

652

len(nodes), cache._max_cache)

655

653

found = {}

654

start_of_leaves = None

656

655

for node_pos, node in self._read_nodes(sorted(nodes)):

657

656

if node_pos == 0: # Special case

658

657

self._root_node = node

659

658

else:

659

if start_of_leaves is None:

660

start_of_leaves = self._row_offsets[-2]

661

if node_pos < start_of_leaves:

662

self._internal_node_cache.add(node_pos, node)

663

else:

664

self._leaf_node_cache.add(node_pos, node)

660

665

cache.add(node_pos, node)

661

666

found[node_pos] = node

662

667

return found

663

668

669

def _expand_nodes(self, node_indexes):

670

"""Find extra nodes to download.

671

672

The idea is that we always want to make big-enough requests (like 64kB

673

for http), so that we don't waste round trips. So given the entries

674

that we already have cached, and the new nodes being downloaded, figure

675

out what other pages we might want to read.

676

677

:param node_indexes: The nodes that have been requested to read.

678

# :param recommended_size: The size of download we want, this assumes

679

# that readv() is accomplished in a single round trip (which is true

680

# for http and bzr+ssh, and sftp uses async to get close

681

# performance.)

682

# :param max_page: The last possible page to read, possibly max_size is

683

# better?

684

:return: A new list of nodes to download

685

"""

686

trace.note('request: %s\tnodes: %s', self._name[-14:], node_indexes)

687

# return node_indexes

688

689

# TODO: This could be cached

690

recommended_read = self._transport.recommended_page_size() / 4

691

recommended_pages = int(math.ceil(recommended_read /

692

float(_PAGE_SIZE)))

693

if len(node_indexes) >= recommended_pages:

694

# Don't add more, we are already requesting more than enough

695

return node_indexes

696

if self._size is None:

697

# Don't try anything, because we don't know where the file ends

698

return node_indexes

699

# The idea here is to get nodes "next to" the ones we are already

700

# getting.

701

max_page = int(math.ceil(self._size / float(_PAGE_SIZE)))

702

# XXX: Update the LRUCache interface to have a .keys() attribute

703

cached_nodes = set(self._internal_node_cache._cache.keys())

704

cached_nodes.update(self._leaf_node_cache._cache.keys())

705

if self._root_node is not None:

706

cached_nodes.add(0)

707

708

# If reading recommended_pages would read the rest of the index, just

709

# do so.

710

if max_page - len(cached_nodes) <= recommended_pages:

711

# Just read the whole thing

712

# NOTE: this breaks a little bit with the distinction between index

713

# nodes and leaf nodes (as they are cached separately). It is

714

# still worthwhile to read it all, but we need to figure out what

715

# cache we should really put things in when we are done.

716

# However, we may not have read the index header yet to know where

717

# the internal nodes break from where the leaf nodes. We are sure

718

# to know *after* we've done the read.

719

# This also does the wrong thing if there are bloom pages.

720

721

# Further, this gives the wrong results because we have 2 caches to

722

# worry about...

723

if cached_nodes:

724

return [x for x in xrange(max_page) if x not in cached_nodes]

725

return range(max_page)

726

727

needed_nodes = recommended_pages - len(node_indexes)

728

final_nodes = set(node_indexes)

729

if node_indexes == [0]:

730

# Special case when we are reading the root node, just read the

731

# first N pages along with the root node

732

final_nodes = node_indexes

733

# for index in xrange(1, max_page):

734

# if index not in final_nodes and index not in cached_nodes:

735

# final_nodes.add(index)

736

# if len(final_nodes) >= recommended_pages:

737

# break

738

else:

739

while len(final_nodes) < recommended_pages:

740

for pos in sorted(final_nodes):

741

if pos > 0:

742

previous = pos - 1

743

if previous not in cached_nodes:

744

final_nodes.add(previous)

745

if pos < max_page:

746

after = pos + 1

747

if after not in cached_nodes:

748

final_nodes.add(after)

749

# if len(final_nodes) > recommended_pages:

750

# break

751

752

final_nodes = sorted(final_nodes)

753

trace.note('\t\t\t\t%s', final_nodes)

754

return final_nodes

755

664

756

def _get_nodes(self, cache, node_indexes):

665

757

found = {}

666

758

needed = []

672

764

found[idx] = cache[idx]

673

765

except KeyError:

674

766

needed.append(idx)

767

if not needed:

768

return found

769

needed = self._expand_nodes(needed)

675

770

found.update(self._cache_nodes(needed, cache))

676

771

return found

677

772

Older »