        # time (self._block._content) is a little expensive.
        self._block._ensure_content(self._last_byte)

    def _check_rebuild_action(self):
        """Check to see if our block should be repacked."""
        total_bytes_used = 0
        last_byte_used = 0
        for factory in self._factories:
            total_bytes_used += factory._end - factory._start
            if last_byte_used < factory._end:
                last_byte_used = factory._end
        # If we are using more than half of the bytes from the block, we have
        # nothing else to check
        if total_bytes_used * 2 >= self._block._content_length:
            return None, last_byte_used, total_bytes_used
        # We are using less than 50% of the content. Is the content we are
        # using at the beginning of the block? If so, we can just trim the
        # tail, rather than rebuilding from scratch.
        if total_bytes_used * 2 > last_byte_used:
            return 'trim', last_byte_used, total_bytes_used
        # We are using a small amount of the data, and it isn't just packed
        # nicely at the front, so rebuild the content. Note that rebuilding
        # can be expensive, since it may mean expanding many deltas into
        # fulltexts, as well. If we build a cheap enough 'strip', then we
        # could try a strip; if that expands the content, we then rebuild.
        return 'rebuild', last_byte_used, total_bytes_used
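    # A worked example for _check_rebuild_action (hypothetical numbers, not
    # from the original source): with a 100kB block, factories covering 60kB
    # give (None, ...); 30kB that all falls within the first 50kB gives
    # 'trim'; 30kB scattered across the whole block gives 'rebuild'.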

    def check_is_well_utilized(self):
        """Is the current block considered 'well utilized'?

        This heuristic asks if the current block considers itself to be a fully
        developed group, rather than just a loose collection of data.
        """
        if len(self._factories) == 1:
            # A block of length 1 could be improved by combining with other
            # groups - don't look deeper. Even larger than max size groups
            # could compress well with adjacent versions of the same thing.
            return False
        action, last_byte_used, total_bytes_used = self._check_rebuild_action()
        block_size = self._block._content_length
        if total_bytes_used < block_size * self._max_cut_fraction:
            # This block wants to trim itself small enough that we want to
            # consider it under-utilized.
            return False
        # TODO: This code is meant to be the twin of _insert_record_stream's
        #       'start_new_block' logic. It would probably be better to factor
        #       out that logic into a shared location, so that it stays
        #       in sync.
        # We currently assume a block is properly utilized whenever it is >75%
        # of the size of a 'full' block. In normal operation, a block is
        # considered full when it hits 4MB of same-file content. So any block
        # >3MB is 'full enough'.
        # The only time this isn't true is when a given block has large-object
        # content (a single file >4MB, etc.).
        # Under these circumstances, we allow a block to grow to
        # 2 x largest_content. Which means that if a given block had a large
        # object, it may actually be under-utilized. However, given that this
        # is 'pack-on-the-fly' it is probably reasonable to not repack large
        # content blobs on-the-fly. Note that because we return False for all
        # 1-item blobs, we will repack them; we may wish to reevaluate our
        # treatment of large object blobs in the future.
        if block_size >= self._full_enough_block_size:
            return True
        # If a block is <3MB, it still may be considered 'full' if it contains
        # mixed content. The current rule is 2MB of mixed content is considered
        # full. So check to see if this block contains mixed content, and
        # set the threshold appropriately.
        common_prefix = None
        for factory in self._factories:
            prefix = factory.key[:-1]
            if common_prefix is None:
                common_prefix = prefix
            elif prefix != common_prefix:
                # Mixed content, check the size appropriately
                if block_size >= self._full_enough_mixed_block_size:
                    return True
                break
        # The content failed both the mixed check and the single-content check,
        # so obviously it is not fully utilized.
        # TODO: there is one other constraint that isn't being checked:
        #       namely, that the entries in the block are in the appropriate
        #       order. For example, you could insert the entries in exactly
        #       reverse groupcompress order, and we would think that is ok.
        #       (all the right objects are in one group, and it is fully
        #       utilized, etc.) For now, we assume that case is rare,
        #       especially since we should always fetch in 'groupcompress'
        #       order.
        return False
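    # Illustrative outcomes (numbers taken from the comments above, assuming
    # the usual 3MB/2MB thresholds): a 3.5MB block that passes the trim check
    # is well utilized; a 2.5MB block whose keys all share one prefix is not;
    # a 2.5MB block with mixed prefixes is well utilized once it crosses
    # _full_enough_mixed_block_size.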

    def _check_rebuild_block(self):
        action, last_byte_used, total_bytes_used = self._check_rebuild_action()
        if action is None:
            return
        if action == 'trim':
            self._trim_block(last_byte_used)
        elif action == 'rebuild':
            self._rebuild_block()
        else:
            raise ValueError('unknown rebuild action: %r' % (action,))

    def _wire_bytes(self):
        """Return a byte stream suitable for transmitting over the wire."""

    versioned_files.stream.close()


class _BatchingBlockFetcher(object):
    """Fetch group compress blocks in batches.

    :ivar total_bytes: int of expected number of bytes needed to fetch the
        currently pending batch.
    """
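
    # A sketch of the intended calling pattern (this mirrors how the record
    # stream code below drives the batcher; BATCH_SIZE is the byte threshold
    # used there):
    #
    #   batcher = _BatchingBlockFetcher(gcvf, locations)
    #   for key in keys:
    #       if batcher.add_key(key) > BATCH_SIZE:
    #           for factory in batcher.yield_factories():
    #               yield factory
    #   for factory in batcher.yield_factories(full_flush=True):
    #       yield factory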

    def __init__(self, gcvf, locations):
        self.gcvf = gcvf
        self.locations = locations
        self.keys = []
        self.batch_memos = {}
        self.memos_to_get = []
        self.total_bytes = 0
        self.last_read_memo = None
        self.manager = None

    def add_key(self, key):
        """Add another key to fetch.

        :return: The estimated number of bytes needed to fetch the batch so
            far.
        """
        self.keys.append(key)
        index_memo, _, _, _ = self.locations[key]
        read_memo = index_memo[0:3]
        # Three possibilities for this read_memo:
        #  - it's already part of this batch; or
        #  - it's not yet part of this batch, but is already cached; or
        #  - it's not yet part of this batch and will need to be fetched.
        if read_memo in self.batch_memos:
            # This read memo is already in this batch.
            return self.total_bytes
        try:
            cached_block = self.gcvf._group_cache[read_memo]
        except KeyError:
            # This read memo is new to this batch, and the data isn't cached
            # either.
            self.batch_memos[read_memo] = None
            self.memos_to_get.append(read_memo)
            byte_length = read_memo[2]
            self.total_bytes += byte_length
        else:
            # This read memo is new to this batch, but cached.
            # Keep a reference to the cached block in batch_memos because it's
            # certain that we'll use it when this batch is processed, but
            # there's a risk that it would fall out of _group_cache between now
            # and then.
            self.batch_memos[read_memo] = cached_block
        return self.total_bytes
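
    # Note: total_bytes only grows for read memos that actually need to be
    # fetched, so blocks already held in gcvf._group_cache never push a batch
    # over the BATCH_SIZE threshold on their own.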

    def _flush_manager(self):
        if self.manager is not None:
            for factory in self.manager.get_record_stream():
                yield factory
            self.manager = None
            self.last_read_memo = None

    def yield_factories(self, full_flush=False):
        """Yield factories for keys added since the last yield.  They will be
        returned in the order they were added via add_key.

        :param full_flush: by default, some results may not be returned in case
            they can be part of the next batch.  If full_flush is True, then
            all results are returned.
        """
        if self.manager is None and not self.keys:
            return
        # Fetch all memos in this batch.
        blocks = self.gcvf._get_blocks(self.memos_to_get)
        # Turn blocks into factories and yield them.
        memos_to_get_stack = list(self.memos_to_get)
        memos_to_get_stack.reverse()
        for key in self.keys:
            index_memo, _, parents, _ = self.locations[key]
            read_memo = index_memo[:3]
            if self.last_read_memo != read_memo:
                # We are starting a new block.  If we have a manager, we have
                # found everything that fits for now, so yield records.
                for factory in self._flush_manager():
                    yield factory
                # Now start a new manager.
                if memos_to_get_stack and memos_to_get_stack[-1] == read_memo:
                    # The next block from _get_blocks will be the block we
                    # need.
                    block_read_memo, block = blocks.next()
                    if block_read_memo != read_memo:
                        raise AssertionError(
                            "block_read_memo out of sync with read_memo"
                            " (%r != %r)" % (block_read_memo, read_memo))
                    self.batch_memos[read_memo] = block
                    memos_to_get_stack.pop()
                else:
                    block = self.batch_memos[read_memo]
                self.manager = _LazyGroupContentManager(block)
                self.last_read_memo = read_memo
            start, end = index_memo[3:5]
            self.manager.add_factory(key, parents, start, end)
        if full_flush:
            for factory in self._flush_manager():
                yield factory
        del self.keys[:]
        self.batch_memos.clear()
        del self.memos_to_get[:]
        self.total_bytes = 0


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True, _unadded_refs=None):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        :param _unadded_refs: private parameter, don't use.
        """
        self._index = index
        self._access = access
        self._delta = delta
        if _unadded_refs is None:
            _unadded_refs = {}
        self._unadded_refs = _unadded_refs
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
        self._fallback_vfs = []

    def without_fallbacks(self):
        """Return a clone of this object without any fallbacks configured."""
        return GroupCompressVersionedFiles(self._index, self._access,
            self._delta, _unadded_refs=dict(self._unadded_refs))
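        # Note: dict(...) above takes a copy, so texts added to either object
        # afterwards are not visible through the other's _unadded_refs.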

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):

            missing.difference_update(set(new_result))
        return result, source_results

    def _get_blocks(self, read_memos):
        """Get GroupCompressBlocks for the given read_memos.

        :returns: a series of (read_memo, block) pairs, in the order they were
            originally passed.
        """
        cached = {}
        for read_memo in read_memos:
            try:
                block = self._group_cache[read_memo]
            except KeyError:
                pass
            else:
                cached[read_memo] = block
        not_cached = []
        not_cached_seen = set()
        for read_memo in read_memos:
            if read_memo in cached:
                # Don't fetch what we already have
                continue
            if read_memo in not_cached_seen:
                # Don't try to fetch the same data twice
                continue
            not_cached.append(read_memo)
            not_cached_seen.add(read_memo)
        raw_records = self._access.get_raw_records(not_cached)
        for read_memo in read_memos:
            try:
                yield read_memo, cached[read_memo]
            except KeyError:
                # Read the block, and cache it.
                zdata = raw_records.next()
                block = GroupCompressBlock.from_bytes(zdata)
                self._group_cache[read_memo] = block
                cached[read_memo] = block
                yield read_memo, block
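        # For example (hypothetical memos m1/m2): _get_blocks([m1, m2, m1])
        # yields (m1, block1), (m2, block2), (m1, block1), reading each
        # uncached block from self._access at most once.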

    def get_missing_compression_parent_keys(self):
        """Return the keys of missing compression parents.

                unadded_keys, source_result)
        for key in missing:
            yield AbsentContentFactory(key)
        # Batch up as many keys as we can until either:
        #  - we encounter an unadded ref, or
        #  - we run out of keys, or
        #  - the total bytes to retrieve for this batch > BATCH_SIZE
        batcher = _BatchingBlockFetcher(self, locations)
        for source, keys in source_keys:
            if source is self:
                for key in keys:
                    if key in self._unadded_refs:
                        # Flush batch, then yield unadded ref from
                        # self._compressor.
                        for factory in batcher.yield_factories(full_flush=True):
                            yield factory
                        bytes, sha1 = self._compressor.extract(key)
                        parents = self._unadded_refs[key]
                        yield FulltextContentFactory(key, parents, sha1, bytes)
                    else:
                        if batcher.add_key(key) > BATCH_SIZE:
                            # Ok, this batch is big enough.  Yield some results.
                            for factory in batcher.yield_factories():
                                yield factory
            else:
                for factory in batcher.yield_factories(full_flush=True):
                    yield factory
                for record in source.get_record_stream(keys, ordering,
                                                       include_delta_closure):
                    yield record
        for factory in batcher.yield_factories(full_flush=True):
            yield factory

    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""

            if reuse_blocks:
                # If the reuse_blocks flag is set, check to see if we can just
                # copy a groupcompress block as-is.
                # We only check on the first record (groupcompress-block) not
                # on all of the (groupcompress-block-ref) entries.
                # The reuse_this_block flag is then kept for as long as the
                # stream keeps yielding (groupcompress-block-ref) records for
                # the same block.
                if record.storage_kind == 'groupcompress-block':
                    # Check to see if we really want to re-use this block
                    insert_manager = record._manager
                    reuse_this_block = insert_manager.check_is_well_utilized()
            else:
                reuse_this_block = False
            if reuse_this_block:
                # We still want to reuse this block
                if record.storage_kind == 'groupcompress-block':
                    # Insert the raw block into the target repo
                    insert_manager = record._manager
                    bytes = record._manager._block.to_bytes()
                    _, start, length = self._access.add_raw_records(
                        [(None, len(bytes))], bytes)[0]

        This allows this _GCGraphIndex to keep track of any missing
        compression parents we may want to have filled in to make those
        indices valid.  It also allows _GCGraphIndex to track any new keys.

        :param graph_index: A GraphIndex
        """
        key_dependencies = self._key_dependencies
        if key_dependencies is None:
            return
        for node in graph_index.iter_all_entries():
            # Add parent refs from graph_index (and discard parent refs
            # that the graph_index has).
            key_dependencies.add_references(node[1], node[3][0])
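        # Note (an assumption about the GraphIndex node layout, based on how
        # it is unpacked here): iter_all_entries() yields
        # (index, key, value, references) tuples, so node[1] is the key and
        # node[3][0] is its first reference list, i.e. the parent keys.
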
from bzrlib._groupcompress_py import (