~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/groupcompress.py

Committer: Martin Pool
Date: 2011-05-26 23:57:26 UTC
mfrom: (5922 +trunk)
mto: (5425.4.19 220464-stale-locks)
mto: This revision was merged to the branch mainline in revision 5970.
Revision ID: mbp@sourcefrog.net-20110526235726-cal6991rsqi99znr

resolve conflicts against trunk

files added:
bzrlib/export_pot.py

bzrlib/repofmt/knitpack_repo.py

bzrlib/tests/blackbox/test_remember_option.py

bzrlib/tests/per_repository/test_file_graph.py

bzrlib/tests/per_repository_reference/test__make_parents_provider.py

bzrlib/tests/per_repository_vf/test_check.py

bzrlib/tests/per_repository_vf/test_fetch.py

bzrlib/tests/per_repository_vf/test_reconcile.py

bzrlib/tests/per_repository_vf/test_write_group.py

bzrlib/tests/per_tree/test_export.py

bzrlib/tests/test_export_pot.py

bzrlib/tests/test_utextwrap.py

bzrlib/utextwrap.py

bzrlib/vf_repository.py

bzrlib/workingtree_3.py

doc/developers/configuration.txt

doc/en/admin-guide/licence.txt

doc/en/tutorials/licence.txt

doc/en/upgrade-guide/licence.txt

doc/en/user-guide/licence.txt

doc/ja/tutorials/licence.txt

doc/ja/user-guide/index-plain.txt

doc/ja/user-guide/licence.txt

tools/build_mo.py

files removed:
bzrlib/deprecated_graph.py

bzrlib/tests/blackbox/test_breakin.py

bzrlib/tests/per_interbranch/test_update_revisions.py

bzrlib/tests/test_deprecated_graph.py

files renamed:
bzrlib/tests/per_repository/test_is_write_locked.py => bzrlib/tests/per_repository/test_locking.py

bzrlib/tests/per_repository/helpers.py => bzrlib/tests/per_repository_vf/helpers.py

bzrlib/tests/per_repository/test__generate_text_key_index.py => bzrlib/tests/per_repository_vf/test__generate_text_key_index.py

bzrlib/tests/per_repository/test_add_inventory_by_delta.py => bzrlib/tests/per_repository_vf/test_add_inventory_by_delta.py

bzrlib/tests/per_repository/test_find_text_key_references.py => bzrlib/tests/per_repository_vf/test_find_text_key_references.py

files modified:
Makefile

bzrlib/__init__.py

bzrlib/_btree_serializer_pyx.pyx

bzrlib/_dirstate_helpers_pyx.pyx

bzrlib/_groupcompress_pyx.pyx

bzrlib/_walkdirs_win32.pyx

bzrlib/annotate.py

bzrlib/api.py

bzrlib/branch.py

bzrlib/btree_index.py

bzrlib/builtins.py

bzrlib/bundle/apply_bundle.py

bzrlib/bundle/bundle_data.py

bzrlib/bundle/commands.py

bzrlib/bundle/serializer/v08.py

bzrlib/bundle/serializer/v09.py

bzrlib/bzrdir.py

bzrlib/cethread.py

bzrlib/check.py

bzrlib/chk_map.py

bzrlib/chunk_writer.py

bzrlib/cleanup.py

bzrlib/cmd_version_info.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/controldir.py

bzrlib/crash.py

bzrlib/decorators.py

bzrlib/delta.h

bzrlib/diff-delta.c

bzrlib/diff.py

bzrlib/directory_service.py

bzrlib/dirstate.py

bzrlib/doc/api/__init__.py

bzrlib/doc_generate/conf.py

bzrlib/email_message.py

bzrlib/errors.py

bzrlib/export/__init__.py

bzrlib/export/tar_exporter.py

bzrlib/export/zip_exporter.py

bzrlib/fetch.py

bzrlib/filters/__init__.py

bzrlib/foreign.py

bzrlib/generate_ids.py

bzrlib/graph.py

bzrlib/groupcompress.py

bzrlib/help_topics/en/configuration.txt

bzrlib/help_topics/en/conflict-types.txt

bzrlib/help_topics/en/rules.txt

bzrlib/hooks.py

bzrlib/ignores.py

bzrlib/index.py

bzrlib/info.py

bzrlib/inter.py

bzrlib/inventory.py

bzrlib/knit.py

bzrlib/lazy_import.py

bzrlib/lazy_regex.py

bzrlib/library_state.py

bzrlib/lock.py

bzrlib/lockable_files.py

bzrlib/lockdir.py

bzrlib/log.py

bzrlib/lsprof.py

bzrlib/memorytree.py

bzrlib/merge.py

bzrlib/merge_directive.py

bzrlib/msgeditor.py

bzrlib/multiparent.py

bzrlib/mutabletree.py

bzrlib/osutils.py

bzrlib/patiencediff.py

bzrlib/plugin.py

bzrlib/plugins/changelog_merge/__init__.py

bzrlib/plugins/changelog_merge/changelog_merge.py

bzrlib/plugins/changelog_merge/tests/test_changelog_merge.py

bzrlib/plugins/launchpad/__init__.py

bzrlib/plugins/launchpad/lp_propose.py

bzrlib/plugins/netrc_credential_store/__init__.py

bzrlib/plugins/news_merge/__init__.py

bzrlib/plugins/weave_fmt/__init__.py

bzrlib/plugins/weave_fmt/branch.py

bzrlib/plugins/weave_fmt/bzrdir.py

bzrlib/plugins/weave_fmt/repository.py

bzrlib/plugins/weave_fmt/test_bzrdir.py

bzrlib/plugins/weave_fmt/workingtree.py

bzrlib/push.py

bzrlib/reconcile.py

bzrlib/registry.py

bzrlib/remote.py

bzrlib/repofmt/groupcompress_repo.py

bzrlib/repofmt/knitrepo.py

bzrlib/repofmt/pack_repo.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/revisiontree.py

bzrlib/rules.py

bzrlib/send.py

bzrlib/shelf.py

bzrlib/sign_my_commits.py

bzrlib/smart/branch.py

bzrlib/smart/bzrdir.py

bzrlib/smart/client.py

bzrlib/smart/medium.py

bzrlib/smart/protocol.py

bzrlib/smart/request.py

bzrlib/smart/server.py

bzrlib/status.py

bzrlib/switch.py

bzrlib/tag.py

bzrlib/testament.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_branch.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_clean_tree.py

bzrlib/tests/blackbox/test_command_encoding.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_dpush.py

bzrlib/tests/blackbox/test_exceptions.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_locale.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_mv.py

bzrlib/tests/blackbox/test_non_ascii.py

bzrlib/tests/blackbox/test_pack.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_reconfigure.py

bzrlib/tests/blackbox/test_remerge.py

bzrlib/tests/blackbox/test_remove.py

bzrlib/tests/blackbox/test_remove_tree.py

bzrlib/tests/blackbox/test_repair_workingtree.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_rmbranch.py

bzrlib/tests/blackbox/test_send.py

bzrlib/tests/blackbox/test_serve.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_shelve.py

bzrlib/tests/blackbox/test_split.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_switch.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/blackbox/test_version.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/blackbox/test_whoami.py

bzrlib/tests/doc_generate/builders/test_texinfo.py

bzrlib/tests/ftp_server/__init__.py

bzrlib/tests/per_branch/__init__.py

bzrlib/tests/per_branch/test_bound_sftp.py

bzrlib/tests/per_branch/test_branch.py

bzrlib/tests/per_branch/test_check.py

bzrlib/tests/per_branch/test_create_checkout.py

bzrlib/tests/per_branch/test_get_revision_id_to_revno_map.py

bzrlib/tests/per_branch/test_hooks.py

bzrlib/tests/per_branch/test_last_revision_info.py

bzrlib/tests/per_branch/test_push.py

bzrlib/tests/per_branch/test_reconcile.py

bzrlib/tests/per_branch/test_revision_history.py

bzrlib/tests/per_branch/test_sprout.py

bzrlib/tests/per_branch/test_update.py

bzrlib/tests/per_bzrdir/test_bzrdir.py

bzrlib/tests/per_controldir/test_controldir.py

bzrlib/tests/per_controldir_colo/test_supported.py

bzrlib/tests/per_foreign_vcs/test_branch.py

bzrlib/tests/per_foreign_vcs/test_repository.py

bzrlib/tests/per_interbranch/__init__.py

bzrlib/tests/per_interbranch/test_fetch.py

bzrlib/tests/per_interbranch/test_push.py

bzrlib/tests/per_interrepository/__init__.py

bzrlib/tests/per_interrepository/test_fetch.py

bzrlib/tests/per_interrepository/test_interrepository.py

bzrlib/tests/per_intertree/__init__.py

bzrlib/tests/per_repository/__init__.py

bzrlib/tests/per_repository/test_add_fallback_repository.py

bzrlib/tests/per_repository/test_check.py

bzrlib/tests/per_repository/test_commit_builder.py

bzrlib/tests/per_repository/test_fetch.py

bzrlib/tests/per_repository/test_fileid_involved.py

bzrlib/tests/per_repository/test_has_same_location.py

bzrlib/tests/per_repository/test_iter_reverse_revision_history.py

bzrlib/tests/per_repository/test_reconcile.py

bzrlib/tests/per_repository/test_repository.py

bzrlib/tests/per_repository/test_statistics.py

bzrlib/tests/per_repository/test_write_group.py

bzrlib/tests/per_repository_chk/__init__.py

bzrlib/tests/per_repository_reference/__init__.py

bzrlib/tests/per_repository_vf/__init__.py

bzrlib/tests/per_repository_vf/test_check_reconcile.py

bzrlib/tests/per_repository_vf/test_repository.py

bzrlib/tests/per_transport.py

bzrlib/tests/per_tree/__init__.py

bzrlib/tests/per_tree/test_get_file_mtime.py

bzrlib/tests/per_tree/test_get_symlink_target.py

bzrlib/tests/per_tree/test_inv.py

bzrlib/tests/per_tree/test_path_content_summary.py

bzrlib/tests/per_tree/test_test_trees.py

bzrlib/tests/per_versionedfile.py

bzrlib/tests/per_workingtree/__init__.py

bzrlib/tests/per_workingtree/test_add.py

bzrlib/tests/per_workingtree/test_add_reference.py

bzrlib/tests/per_workingtree/test_annotate_iter.py

bzrlib/tests/per_workingtree/test_basis_inventory.py

bzrlib/tests/per_workingtree/test_break_lock.py

bzrlib/tests/per_workingtree/test_check.py

bzrlib/tests/per_workingtree/test_check_state.py

bzrlib/tests/per_workingtree/test_commit.py

bzrlib/tests/per_workingtree/test_executable.py

bzrlib/tests/per_workingtree/test_flush.py

bzrlib/tests/per_workingtree/test_inv.py

bzrlib/tests/per_workingtree/test_is_control_filename.py

bzrlib/tests/per_workingtree/test_is_ignored.py

bzrlib/tests/per_workingtree/test_locking.py

bzrlib/tests/per_workingtree/test_merge_from_branch.py

bzrlib/tests/per_workingtree/test_move.py

bzrlib/tests/per_workingtree/test_nested_specifics.py

bzrlib/tests/per_workingtree/test_parents.py

bzrlib/tests/per_workingtree/test_pull.py

bzrlib/tests/per_workingtree/test_read_working_inventory.py

bzrlib/tests/per_workingtree/test_readonly.py

bzrlib/tests/per_workingtree/test_remove.py

bzrlib/tests/per_workingtree/test_rename_one.py

bzrlib/tests/per_workingtree/test_revision_tree.py

bzrlib/tests/per_workingtree/test_set_root_id.py

bzrlib/tests/per_workingtree/test_smart_add.py

bzrlib/tests/per_workingtree/test_symlinks.py

bzrlib/tests/per_workingtree/test_uncommit.py

bzrlib/tests/per_workingtree/test_unversion.py

bzrlib/tests/per_workingtree/test_workingtree.py

bzrlib/tests/test__dirstate_helpers.py

bzrlib/tests/test__groupcompress.py

bzrlib/tests/test_annotate.py

bzrlib/tests/test_atomicfile.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_branchbuilder.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_cache_utf8.py

bzrlib/tests/test_chk_map.py

bzrlib/tests/test_clean_tree.py

bzrlib/tests/test_commands.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_commit_merge.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_crash.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_dirstate.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_export.py

bzrlib/tests/test_foreign.py

bzrlib/tests/test_ftp_transport.py

bzrlib/tests/test_generate_ids.py

bzrlib/tests/test_globbing.py

bzrlib/tests/test_groupcompress.py

bzrlib/tests/test_hashcache.py

bzrlib/tests/test_hooks.py

bzrlib/tests/test_http.py

bzrlib/tests/test_ignores.py

bzrlib/tests/test_import_tariff.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lazy_import.py

bzrlib/tests/test_lazy_regex.py

bzrlib/tests/test_lockdir.py

bzrlib/tests/test_log.py

bzrlib/tests/test_lru_cache.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_mergetools.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_multiparent.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_registry.py

bzrlib/tests/test_remote.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revert.py

bzrlib/tests/test_revisionspec.py

bzrlib/tests/test_revisiontree.py

bzrlib/tests/test_rules.py

bzrlib/tests/test_script.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_shelf.py

bzrlib/tests/test_shelf_ui.py

bzrlib/tests/test_smart.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_smart_transport.py

bzrlib/tests/test_store.py

bzrlib/tests/test_subsume.py

bzrlib/tests/test_switch.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_treeshape.py

bzrlib/tests/test_tuned_gzip.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_uncommit.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_version.py

bzrlib/tests/test_version_info.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_workingtree_4.py

bzrlib/tests/transport_util.py

bzrlib/trace.py

bzrlib/transform.py

bzrlib/transport/__init__.py

bzrlib/transport/http/_urllib2_wrappers.py

bzrlib/transport/local.py

bzrlib/transport/sftp.py

bzrlib/tree.py

bzrlib/tsort.py

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/upgrade.py

bzrlib/version_info_formats/format_rio.py

bzrlib/versionedfile.py

bzrlib/win32utils.py

bzrlib/workingtree.py

bzrlib/workingtree_4.py

doc/developers/bug-handling.txt

doc/developers/contribution-quickstart.txt

doc/developers/dirstate.txt

doc/developers/index.txt

doc/developers/integration.txt

doc/developers/overview.txt

doc/developers/releasing.txt

doc/developers/revision-properties.txt

doc/developers/testing.txt

doc/en/_static/en/bzr-en-quick-reference.pdf

doc/en/_static/en/bzr-en-quick-reference.png

doc/en/_static/en/bzr-en-quick-reference.svg

doc/en/admin-guide/index-plain.txt

doc/en/admin-guide/index.txt

doc/en/mini-tutorial/index.txt

doc/en/release-notes/bzr-2.1.txt

doc/en/release-notes/bzr-2.3.txt

doc/en/release-notes/bzr-2.4.txt

doc/en/tutorials/centralized_workflow.txt

doc/en/tutorials/index.txt

doc/en/tutorials/tutorial.txt

doc/en/upgrade-guide/index.txt

doc/en/user-guide/branching_a_project.txt

doc/en/user-guide/configuring_bazaar.txt

doc/en/user-guide/index-plain.txt

doc/en/user-guide/index.txt

doc/en/user-guide/merging_changes.txt

doc/en/user-guide/organizing_branches.txt

doc/en/user-guide/publishing_a_branch.txt

doc/en/user-guide/reusing_a_checkout.txt

doc/en/user-guide/stacked.txt

doc/en/user-guide/using_checkouts.txt

doc/en/whats-new/whats-new-in-2.3.txt

doc/en/whats-new/whats-new-in-2.4.txt

doc/ja/index.txt

doc/ja/mini-tutorial/index.txt

doc/ja/tutorials/centralized_workflow.txt

doc/ja/tutorials/index.txt

doc/ja/tutorials/tutorial.txt

doc/ja/tutorials/using_bazaar_with_launchpad.txt

doc/ja/upgrade-guide/data_migration.txt

doc/ja/upgrade-guide/overview.txt

doc/ja/upgrade-guide/tips_and_tricks.txt

doc/ja/user-guide/adv_merging.txt

doc/ja/user-guide/branching_a_project.txt

doc/ja/user-guide/bug_trackers.txt

doc/ja/user-guide/configuring_bazaar.txt

doc/ja/user-guide/entering_commands.txt

doc/ja/user-guide/filtered_views.txt

doc/ja/user-guide/getting_help.txt

doc/ja/user-guide/hooks.txt

doc/ja/user-guide/http_smart_server.txt

doc/ja/user-guide/index.txt

doc/ja/user-guide/installing_bazaar.txt

doc/ja/user-guide/introducing_bazaar.txt

doc/ja/user-guide/merging_changes.txt

doc/ja/user-guide/organizing_branches.txt

doc/ja/user-guide/organizing_your_workspace.txt

doc/ja/user-guide/plugins.txt

doc/ja/user-guide/publishing_a_branch.txt

doc/ja/user-guide/reusing_a_checkout.txt

doc/ja/user-guide/reviewing_changes.txt

doc/ja/user-guide/sending_changes.txt

doc/ja/user-guide/server.txt

doc/ja/user-guide/setting_up_email.txt

doc/ja/user-guide/shared_repository_layouts.txt

doc/ja/user-guide/specifying_revisions.txt

doc/ja/user-guide/stacked.txt

doc/ja/user-guide/svn_plugin.txt

doc/ja/user-guide/undoing_mistakes.txt

doc/ja/user-guide/using_checkouts.txt

doc/ja/user-guide/version_info.txt

doc/ja/user-guide/web_browsing.txt

doc/ja/user-guide/writing_a_plugin.txt

doc/ja/user-reference/index.txt

profile_imports.py

setup.py

tools/win32/py2exe_boot_common.py

Show diffs side-by-side

added added

removed removed

bzrlib/groupcompress.py

except ImportError:

pylzma = None

from bzrlib.lazy_import import lazy_import

lazy_import(globals(), """

from bzrlib import (

annotate,

config,

debug,

errors,

graph as _mod_graph,

knit,

osutils,

pack,

static_tuple,

trace,

tsort,

)

from bzrlib.repofmt import pack_repo

""")

from bzrlib.btree_index import BTreeBuilder

from bzrlib.lru_cache import LRUSizeCache

from bzrlib.tsort import topo_sort

from bzrlib.versionedfile import (

_KeyRefs,

adapter_registry,

AbsentContentFactory,

ChunkedContentFactory,

FulltextContentFactory,

VersionedFiles,

VersionedFilesWithFallbacks,

)

# Minimum number of uncompressed bytes to try fetch at once when retrieving

present_keys = []

for prefix in sorted(per_prefix_map):

present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))

present_keys.extend(reversed(tsort.topo_sort(per_prefix_map[prefix])))

return present_keys

484

491

_full_enough_block_size = 3*1024*1024 # size at which we won't repack

485

492

_full_enough_mixed_block_size = 2*768*1024 # 1.5MB

486

493

487

def __init__(self, block):

494

def __init__(self, block, get_compressor_settings=None):

488

495

self._block = block

489

496

# We need to preserve the ordering

490

497

self._factories = []

491

498

self._last_byte = 0

499

self._get_settings = get_compressor_settings

500

self._compressor_settings = None

501

502

def _get_compressor_settings(self):

503

if self._compressor_settings is not None:

504

return self._compressor_settings

505

settings = None

506

if self._get_settings is not None:

507

settings = self._get_settings()

508

if settings is None:

509

vf = GroupCompressVersionedFiles

510

settings = vf._DEFAULT_COMPRESSOR_SETTINGS

511

self._compressor_settings = settings

512

return self._compressor_settings

492

513

493

514

def add_factory(self, key, parents, start, end):

494

515

if not self._factories:

527

548

new_block.set_content(self._block._content[:last_byte])

528

549

self._block = new_block

529

550

551

def _make_group_compressor(self):

552

return GroupCompressor(self._get_compressor_settings())

553

530

554

def _rebuild_block(self):

531

555

"""Create a new GroupCompressBlock with only the referenced texts."""

532

compressor = GroupCompressor()

556

compressor = self._make_group_compressor()

533

557

tstart = time.time()

534

558

old_length = self._block._content_length

535

559

end_point = 0

547

571

# block? It seems hard to come up with a method that it would

548

572

# expand, since we do full compression again. Perhaps based on a

549

573

# request that ends up poorly ordered?

574

# TODO: If the content would have expanded, then we would want to

575

# handle a case where we need to split the block.

576

# Now that we have a user-tweakable option

577

# (max_bytes_to_index), it is possible that one person set it

578

# to a very low value, causing poor compression.

550

579

delta = time.time() - tstart

551

580

self._block = new_block

552

581

trace.mutter('creating new compressed block on-the-fly in %.3fs'

775

804

776

805

class _CommonGroupCompressor(object):

777

806

778

def __init__(self):

807

def __init__(self, settings=None):

779

808

"""Create a GroupCompressor."""

780

809

self.chunks = []

781

810

self._last = None

784

813

self.labels_deltas = {}

785

814

self._delta_index = None # Set by the children

786

815

self._block = GroupCompressBlock()

816

if settings is None:

817

self._settings = {}

818

else:

819

self._settings = settings

787

820

788

821

def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):

789

822

"""Compress lines with label key.

904

937

905

938

class PythonGroupCompressor(_CommonGroupCompressor):

906

939

907

def __init__(self):

940

def __init__(self, settings=None):

908

941

"""Create a GroupCompressor.

909

942

910

943

Used only if the pyrex version is not available.

911

944

"""

912

super(PythonGroupCompressor, self).__init__()

945

super(PythonGroupCompressor, self).__init__(settings)

913

946

self._delta_index = LinesDeltaIndex([])

914

947

# The actual content is managed by LinesDeltaIndex

915

948

self.chunks = self._delta_index.lines

952

985

953

986

It contains code very similar to SequenceMatcher because of having a similar

954

987

task. However some key differences apply:

955

- there is no junk, we want a minimal edit not a human readable diff.

956

- we don't filter very common lines (because we don't know where a good

957

range will start, and after the first text we want to be emitting minimal

958

edits only.

959

- we chain the left side, not the right side

960

- we incrementally update the adjacency matrix as new lines are provided.

961

- we look for matches in all of the left side, so the routine which does

962

the analogous task of find_longest_match does not need to filter on the

963

left side.

988

989

* there is no junk, we want a minimal edit not a human readable diff.

990

* we don't filter very common lines (because we don't know where a good

991

range will start, and after the first text we want to be emitting minimal

992

edits only.

993

* we chain the left side, not the right side

994

* we incrementally update the adjacency matrix as new lines are provided.

995

* we look for matches in all of the left side, so the routine which does

996

the analogous task of find_longest_match does not need to filter on the

997

left side.

964

998

"""

965

999

966

def __init__(self):

967

super(PyrexGroupCompressor, self).__init__()

968

self._delta_index = DeltaIndex()

1000

def __init__(self, settings=None):

1001

super(PyrexGroupCompressor, self).__init__(settings)

1002

max_bytes_to_index = self._settings.get('max_bytes_to_index', 0)

1003

self._delta_index = DeltaIndex(max_bytes_to_index=max_bytes_to_index)

969

1004

970

1005

def _compress(self, key, bytes, max_delta_size, soft=False):

971

1006

"""see _CommonGroupCompressor._compress"""

1046

1081

index = _GCGraphIndex(graph_index, lambda:True, parents=parents,

1047

1082

add_callback=graph_index.add_nodes,

1048

1083

inconsistency_fatal=inconsistency_fatal)

1049

access = knit._DirectPackAccess({})

1084

access = pack_repo._DirectPackAccess({})

1050

1085

access.set_writer(writer, graph_index, (transport, 'newpack'))

1051

1086

result = GroupCompressVersionedFiles(index, access, delta)

1052

1087

result.stream = stream

1062

1097

1063

1098

class _BatchingBlockFetcher(object):

1064

1099

"""Fetch group compress blocks in batches.

1065

1100

1066

1101

:ivar total_bytes: int of expected number of bytes needed to fetch the

1067

1102

currently pending batch.

1068

1103

"""

1069

1104

1070

def __init__(self, gcvf, locations):

1105

def __init__(self, gcvf, locations, get_compressor_settings=None):

1071

1106

self.gcvf = gcvf

1072

1107

self.locations = locations

1073

1108

self.keys = []

1076

1111

self.total_bytes = 0

1077

1112

self.last_read_memo = None

1078

1113

self.manager = None

1114

self._get_compressor_settings = get_compressor_settings

1079

1115

1080

1116

def add_key(self, key):

1081

1117

"""Add another to key to fetch.

1082

1118

1083

1119

:return: The estimated number of bytes needed to fetch the batch so

1084

1120

far.

1085

1121

"""

1110

1146

# and then.

1111

1147

self.batch_memos[read_memo] = cached_block

1112

1148

return self.total_bytes

1113

1149

1114

1150

def _flush_manager(self):

1115

1151

if self.manager is not None:

1116

1152

for factory in self.manager.get_record_stream():

1121

1157

def yield_factories(self, full_flush=False):

1122

1158

"""Yield factories for keys added since the last yield. They will be

1123

1159

returned in the order they were added via add_key.

1124

1160

1125

1161

:param full_flush: by default, some results may not be returned in case

1126

1162

they can be part of the next batch. If full_flush is True, then

1127

1163

all results are returned.

1155

1191

memos_to_get_stack.pop()

1156

1192

else:

1157

1193

block = self.batch_memos[read_memo]

1158

self.manager = _LazyGroupContentManager(block)

1194

self.manager = _LazyGroupContentManager(block,

1195

get_compressor_settings=self._get_compressor_settings)

1159

1196

self.last_read_memo = read_memo

1160

1197

start, end = index_memo[3:5]

1161

1198

self.manager.add_factory(key, parents, start, end)

1168

1205

self.total_bytes = 0

1169

1206

1170

1207

1171

class GroupCompressVersionedFiles(VersionedFiles):

1208

class GroupCompressVersionedFiles(VersionedFilesWithFallbacks):

1172

1209

"""A group-compress based VersionedFiles implementation."""

1173

1210

1174

def __init__(self, index, access, delta=True, _unadded_refs=None):

1211

# This controls how the GroupCompress DeltaIndex works. Basically, we

1212

# compute hash pointers into the source blocks (so hash(text) => text).

1213

# However each of these references costs some memory in trade against a

1214

# more accurate match result. For very large files, they either are

1215

# pre-compressed and change in bulk whenever they change, or change in just

1216

# local blocks. Either way, 'improved resolution' is not very helpful,

1217

# versus running out of memory trying to track everything. The default max

1218

# gives 100% sampling of a 1MB file.

1219

_DEFAULT_MAX_BYTES_TO_INDEX = 1024 * 1024

1220

_DEFAULT_COMPRESSOR_SETTINGS = {'max_bytes_to_index':

1221

_DEFAULT_MAX_BYTES_TO_INDEX}

1222

1223

def __init__(self, index, access, delta=True, _unadded_refs=None,

1224

_group_cache=None):

1175

1225

"""Create a GroupCompressVersionedFiles object.

1176

1226

1177

1227

:param index: The index object storing access and graph data.

1178

1228

:param access: The access object storing raw data.

1179

1229

:param delta: Whether to delta compress or just entropy compress.

1180

1230

:param _unadded_refs: private parameter, don't use.

1231

:param _group_cache: private parameter, don't use.

1181

1232

"""

1182

1233

self._index = index

1183

1234

self._access = access

1185

1236

if _unadded_refs is None:

1186

1237

_unadded_refs = {}

1187

1238

self._unadded_refs = _unadded_refs

1188

self._group_cache = LRUSizeCache(max_size=50*1024*1024)

1239

if _group_cache is None:

1240

_group_cache = LRUSizeCache(max_size=50*1024*1024)

1241

self._group_cache = _group_cache

1189

1242

self._immediate_fallback_vfs = []

1243

self._max_bytes_to_index = None

1190

1244

1191

1245

def without_fallbacks(self):

1192

1246

"""Return a clone of this object without any fallbacks configured."""

1193

1247

return GroupCompressVersionedFiles(self._index, self._access,

1194

self._delta, _unadded_refs=dict(self._unadded_refs))

1248

self._delta, _unadded_refs=dict(self._unadded_refs),

1249

_group_cache=self._group_cache)

1195

1250

1196

1251

def add_lines(self, key, parents, lines, parent_texts=None,

1197

1252

left_matching_blocks=None, nostore_sha=None, random_id=False,

1201

1256

:param key: The key tuple of the text to add.

1202

1257

:param parents: The parents key tuples of the text to add.

1203

1258

:param lines: A list of lines. Each line must be a bytestring. And all

1204

of them except the last must be terminated with \n and contain no

1205

other \n's. The last line may either contain no \n's or a single

1206

terminating \n. If the lines list does not meet this constraint the add

1207

routine may error or may succeed - but you will be unable to read

1208

the data back accurately. (Checking the lines have been split

1259

of them except the last must be terminated with \\n and contain no

1260

other \\n's. The last line may either contain no \\n's or a single

1261

terminating \\n. If the lines list does not meet this constraint the

1262

add routine may error or may succeed - but you will be unable to

1263

read the data back accurately. (Checking the lines have been split

1209

1264

correctly is expensive and extremely unlikely to catch bugs so it

1210

1265

is not done at runtime unless check_content is True.)

1211

1266

:param parent_texts: An optional dictionary containing the opaque

1306

1361

self._check_lines_not_unicode(lines)

1307

1362

self._check_lines_are_lines(lines)

1308

1363

1309

def get_known_graph_ancestry(self, keys):

1310

"""Get a KnownGraph instance with the ancestry of keys."""

1311

# Note that this is identical to

1312

# KnitVersionedFiles.get_known_graph_ancestry, but they don't share

1313

# ancestry.

1314

parent_map, missing_keys = self._index.find_ancestry(keys)

1315

for fallback in self._transitive_fallbacks():

1316

if not missing_keys:

1317

break

1318

(f_parent_map, f_missing_keys) = fallback._index.find_ancestry(

1319

missing_keys)

1320

parent_map.update(f_parent_map)

1321

missing_keys = f_missing_keys

1322

kg = _mod_graph.KnownGraph(parent_map)

1323

return kg

1324

1325

1364

def get_parent_map(self, keys):

1326

1365

"""Get a map of the graph parents of keys.

1327

1366

1465

1504

1466

1505

The returned objects should be in the order defined by 'ordering',

1467

1506

which can weave between different sources.

1507

1468

1508

:param ordering: Must be one of 'topological' or 'groupcompress'

1469

1509

:return: List of [(source, [keys])] tuples, such that all keys are in

1470

1510

the defined order, regardless of source.

1471

1511

"""

1472

1512

if ordering == 'topological':

1473

present_keys = topo_sort(parent_map)

1513

present_keys = tsort.topo_sort(parent_map)

1474

1514

else:

1475

1515

# ordering == 'groupcompress'

1476

1516

# XXX: This only optimizes for the target ordering. We may need

1565

1605

# - we encounter an unadded ref, or

1566

1606

# - we run out of keys, or

1567

1607

# - the total bytes to retrieve for this batch > BATCH_SIZE

1568

batcher = _BatchingBlockFetcher(self, locations)

1608

batcher = _BatchingBlockFetcher(self, locations,

1609

get_compressor_settings=self._get_compressor_settings)

1569

1610

for source, keys in source_keys:

1570

1611

if source is self:

1571

1612

for key in keys:

1617

1658

for _ in self._insert_record_stream(stream, random_id=False):

1618

1659

pass

1619

1660

1661

def _get_compressor_settings(self):

1662

if self._max_bytes_to_index is None:

1663

# TODO: VersionedFiles don't know about their containing

1664

# repository, so they don't have much of an idea about their

1665

# location. So for now, this is only a global option.

1666

c = config.GlobalConfig()

1667

val = c.get_user_option('bzr.groupcompress.max_bytes_to_index')

1668

if val is not None:

1669

try:

1670

val = int(val)

1671

except ValueError, e:

1672

trace.warning('Value for '

1673

'"bzr.groupcompress.max_bytes_to_index"'

1674

' %r is not an integer'

1675

% (val,))

1676

val = None

1677

if val is None:

1678

val = self._DEFAULT_MAX_BYTES_TO_INDEX

1679

self._max_bytes_to_index = val

1680

return {'max_bytes_to_index': self._max_bytes_to_index}

1681

1682

def _make_group_compressor(self):

1683

return GroupCompressor(self._get_compressor_settings())

1684

1620

1685

def _insert_record_stream(self, stream, random_id=False, nostore_sha=None,

1621

1686

reuse_blocks=True):

1622

1687

"""Internal core to insert a record stream into this container.

1645

1710

return adapter

1646

1711

# This will go up to fulltexts for gc to gc fetching, which isn't

1647

1712

# ideal.

1648

self._compressor = GroupCompressor()

1713

self._compressor = self._make_group_compressor()

1649

1714

self._unadded_refs = {}

1650

1715

keys_to_add = []

1651

1716

def flush():

1652

1717

bytes_len, chunks = self._compressor.flush().to_chunks()

1653

self._compressor = GroupCompressor()

1718

self._compressor = self._make_group_compressor()

1654

1719

# Note: At this point we still have 1 copy of the fulltext (in

1655

1720

# record and the var 'bytes'), and this generates 2 copies of

1656

1721

# the compressed text (one for bytes, one in chunks)

1921

1986

# repeated over and over, this creates a surplus of ints

1922

1987

self._int_cache = {}

1923

1988

if track_external_parent_refs:

1924

self._key_dependencies = knit._KeyRefs(

1989

self._key_dependencies = _KeyRefs(

1925

1990

track_new_keys=track_new_keys)

1926

1991

else:

1927

1992

self._key_dependencies = None

2067

2132

:param keys: An iterable of keys.

2068

2133

:return: A dict of key:

2069

2134

(index_memo, compression_parent, parents, record_details).

2070

index_memo

2071

opaque structure to pass to read_records to extract the raw

2072

data

2073

compression_parent

2074

Content that this record is built upon, may be None

2075

parents

2076

Logical parents of this node

2077

record_details

2078

extra information about the content which needs to be passed to

2079

Factory.parse_record

2135

2136

* index_memo: opaque structure to pass to read_records to extract

2137

the raw data

2138

* compression_parent: Content that this record is built upon, may

2139

be None

2140

* parents: Logical parents of this node

2141

* record_details: extra information about the content which needs

2142

to be passed to Factory.parse_record

2080

2143

"""

2081

2144

self._check_read()

2082

2145

result = {}

Older »