~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/groupcompress.py

Committer: Canonical.com Patch Queue Manager
Date: 2009-09-19 00:32:14 UTC
mfrom: (4685.2.1 bzr.dev)
Revision ID: pqm@pqm.ubuntu.com-20090919003214-2dli9jc4y5xhjj3n

(mbp for garyvdm) Revert rename of
test_merge_uncommitted_otherbasis_ancestor_of_thisbasis.

files added:
bzrlib/doc_generate/sphinx_conf.py

bzrlib/tests/script.py

bzrlib/tests/test_script.py

bzrlib/transport/pathfilter.py

doc/Bazaar-Logo-For-Manuals.png

doc/developers/_static

doc/developers/_static/bzr icon 16.png

doc/developers/_static/bzr.ico

doc/developers/_templates

doc/developers/_templates/layout.html

doc/developers/conf.py

doc/developers/implementation-notes.txt

doc/developers/index-plain.txt

doc/developers/miscellaneous-notes.txt

doc/developers/plans.txt

doc/developers/process.txt

doc/developers/specifications.txt

doc/en/user-guide/index-plain.txt

doc/es/_static

doc/es/_static/bzr icon 16.png

doc/es/_static/bzr.ico

doc/es/_static/es

doc/es/_templates

doc/es/_templates/layout.html

doc/es/conf.py

doc/es/quick-reference/index.txt

doc/es/user-guide/index-plain.txt

doc/index.es.txt

doc/index.ru.txt

doc/ru/_static

doc/ru/_static/bzr icon 16.png

doc/ru/_static/bzr.ico

doc/ru/_static/ru

doc/ru/_templates

doc/ru/_templates/layout.html

doc/ru/conf.py

doc/ru/quick-reference/index.txt

doc/ru/user-guide/index-plain.txt

tools/generate_release_notes.py

tools/package_docs.py

tools/packaging/update-control.sh

files removed:
doc/BUILD-NOTES

doc/_static/en/quick-reference

doc/bazaar-vcs.org.kid

doc/en/developer-guide

doc/en/user-guide/index.txt

doc/es/developer-guide

doc/es/release-notes

doc/es/user-reference

files renamed:
doc/en/developer-guide/HACKING.txt => doc/developers/HACKING.txt

doc/Makefile => doc/en/Makefile

doc/_static/ => doc/en/_static/

doc/_static/en/quick-reference/Makefile => doc/en/_static/en/Makefile

doc/_static/en/quick-reference/bzr-quick-reference.pdf => doc/en/_static/en/bzr-en-quick-reference.pdf

doc/_static/en/quick-reference/bzr-quick-reference.png => doc/en/_static/en/bzr-en-quick-reference.png

doc/_static/en/quick-reference/bzr-quick-reference.svg => doc/en/_static/en/bzr-en-quick-reference.svg

doc/_templates/ => doc/en/_templates/

doc/conf.py => doc/en/conf.py

doc/contents.txt => doc/en/index.txt

doc/make.bat => doc/en/make.bat

doc/en/user-guide/index-for-2x.txt => doc/en/user-guide/index.txt

doc/es/quick-reference/Makefile => doc/es/_static/es/Makefile

doc/es/quick-reference/quick-start-summary.pdf => doc/es/_static/es/bzr-es-quick-reference.pdf

doc/es/quick-reference/quick-start-summary.png => doc/es/_static/es/bzr-es-quick-reference.png

doc/es/quick-reference/quick-start-summary.svg => doc/es/_static/es/bzr-es-quick-reference.svg

doc/index.es.txt => doc/es/index.txt

doc/ru/quick-reference/Makefile => doc/ru/_static/ru/Makefile

doc/ru/quick-reference/quick-start-summary.pdf => doc/ru/_static/ru/bzr-ru-quick-reference.pdf

doc/ru/quick-reference/quick-start-summary.png => doc/ru/_static/ru/bzr-ru-quick-reference.png

doc/ru/quick-reference/quick-start-summary.svg => doc/ru/_static/ru/bzr-ru-quick-reference.svg

doc/index.ru.txt => doc/ru/index.txt

files modified:
.bzrignore

Makefile

NEWS

bzrlib/__init__.py

bzrlib/_btree_serializer_pyx.pyx

bzrlib/_known_graph_py.py

bzrlib/_known_graph_pyx.pyx

bzrlib/annotate.py

bzrlib/bencode.py

bzrlib/btree_index.py

bzrlib/builtins.py

bzrlib/chk_map.py

bzrlib/commands.py

bzrlib/dirstate.py

bzrlib/doc_generate/autodoc_rstx.py

bzrlib/graph.py

bzrlib/groupcompress.py

bzrlib/help_topics/en/configuration.txt

bzrlib/hooks.py

bzrlib/knit.py

bzrlib/lsprof.py

bzrlib/mail_client.py

bzrlib/merge.py

bzrlib/mutabletree.py

bzrlib/osutils.py

bzrlib/plugin.py

bzrlib/registry.py

bzrlib/remote.py

bzrlib/repofmt/groupcompress_repo.py

bzrlib/repofmt/pack_repo.py

bzrlib/repofmt/weaverepo.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/rio.py

bzrlib/smart/medium.py

bzrlib/smart/message.py

bzrlib/smart/protocol.py

bzrlib/smart/repository.py

bzrlib/smart/request.py

bzrlib/smart/server.py

bzrlib/symbol_versioning.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_breakin.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_filesystem_cicp.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_locale.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_mv.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_remove.py

bzrlib/tests/blackbox/test_remove_tree.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_send.py

bzrlib/tests/blackbox/test_serve.py

bzrlib/tests/blackbox/test_split.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_version.py

bzrlib/tests/http_utils.py

bzrlib/tests/per_branch/test_permissions.py

bzrlib/tests/per_branch/test_push.py

bzrlib/tests/per_interrepository/test_fetch.py

bzrlib/tests/per_pack_repository.py

bzrlib/tests/per_repository/test_fileid_involved.py

bzrlib/tests/per_repository/test_reconcile.py

bzrlib/tests/per_repository/test_repository.py

bzrlib/tests/per_repository/test_write_group.py

bzrlib/tests/per_repository_chk/__init__.py

bzrlib/tests/per_repository_chk/test_supported.py

bzrlib/tests/per_repository_reference/test_add_revision.py

bzrlib/tests/per_repository_reference/test_add_signature_text.py

bzrlib/tests/per_workingtree/test_commit.py

bzrlib/tests/per_workingtree/test_content_filters.py

bzrlib/tests/per_workingtree/test_flush.py

bzrlib/tests/per_workingtree/test_locking.py

bzrlib/tests/per_workingtree/test_parents.py

bzrlib/tests/per_workingtree/test_set_root_id.py

bzrlib/tests/test__known_graph.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_crash.py

bzrlib/tests/test_groupcompress.py

bzrlib/tests/test_hooks.py

bzrlib/tests/test_http.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_lsprof.py

bzrlib/tests/test_mail_client.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_mutabletree.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_permissions.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_registry.py

bzrlib/tests/test_remote.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_shelf.py

bzrlib/tests/test_smart.py

bzrlib/tests/test_smart_transport.py

bzrlib/tests/test_source.py

bzrlib/tests/test_symbol_versioning.py

bzrlib/tests/test_trace.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_version.py

bzrlib/tests/transport_util.py

bzrlib/trace.py

bzrlib/transform.py

bzrlib/transport/__init__.py

bzrlib/transport/chroot.py

doc/developers/bug-handling.txt

doc/developers/cycle.txt

doc/developers/dirstate.txt

doc/developers/index.txt

doc/developers/integration.txt

doc/developers/network-protocol.txt

doc/developers/overview.txt

doc/developers/ppa.txt

doc/developers/releasing.txt

doc/developers/testing.txt

doc/en/_templates/index.html

doc/en/_templates/layout.html

doc/en/mini-tutorial/index.txt

doc/en/quick-reference/index.txt

doc/en/tutorials/centralized_workflow.txt

doc/en/tutorials/tutorial.txt

doc/en/tutorials/using_bazaar_with_launchpad.txt

doc/en/upgrade-guide/data_migration.txt

doc/en/upgrade-guide/index.txt

doc/en/user-guide/branching_a_project.txt

doc/en/user-guide/browsing_history.txt

doc/en/user-guide/core_concepts.txt

doc/en/user-guide/getting_help.txt

doc/en/user-guide/installing_bazaar.txt

doc/en/user-guide/merging_changes.txt

doc/en/user-guide/organizing_your_workspace.txt

doc/en/user-guide/plugins.txt

doc/en/user-guide/publishing_a_branch.txt

doc/en/user-guide/server.txt

doc/en/user-guide/undoing_mistakes.txt

doc/en/user-guide/using_checkouts.txt

doc/en/user-guide/writing_a_plugin.txt

doc/es/mini-tutorial/index.txt

doc/es/user-guide/index.txt

doc/index.txt

setup.py

tools/packaging/build-packages.sh

tools/packaging/update-changelogs.sh

tools/packaging/update-packaging-branches.sh

Show diffs side-by-side

added added

removed removed

bzrlib/groupcompress.py

457

# There are code paths that first extract as fulltext, and then

458

# extract as storage_kind (smart fetch). So we don't break the

459

# refcycle here, but instead in manager.get_record_stream()

460

# self._manager = None

461

460

if storage_kind == 'fulltext':

462

461

return self._bytes

463

462

else:

469

468

class _LazyGroupContentManager(object):

470

469

"""This manages a group of _LazyGroupCompressFactory objects."""

471

470

471

_max_cut_fraction = 0.75 # We allow a block to be trimmed to 75% of

472

# current size, and still be considered

473

# reusable

474

_full_block_size = 4*1024*1024

475

_full_mixed_block_size = 2*1024*1024

476

_full_enough_block_size = 3*1024*1024 # size at which we won't repack

477

_full_enough_mixed_block_size = 2*768*1024 # 1.5MB

478

472

479

def __init__(self, block):

473

480

self._block = block

474

481

# We need to preserve the ordering

546

553

# time (self._block._content) is a little expensive.

547

554

self._block._ensure_content(self._last_byte)

548

555

549

def _check_rebuild_block(self):

556

def _check_rebuild_action(self):

550

557

"""Check to see if our block should be repacked."""

551

558

total_bytes_used = 0

552

559

last_byte_used = 0

553

560

for factory in self._factories:

554

561

total_bytes_used += factory._end - factory._start

555

last_byte_used = max(last_byte_used, factory._end)

556

# If we are using most of the bytes from the block, we have nothing

557

# else to check (currently more than 1/2)

562

if last_byte_used < factory._end:

563

last_byte_used = factory._end

564

# If we are using more than half of the bytes from the block, we have

565

# nothing else to check

558

566

if total_bytes_used * 2 >= self._block._content_length:

559

return

560

# Can we just strip off the trailing bytes? If we are going to be

561

# transmitting more than 50% of the front of the content, go ahead

567

return None, last_byte_used, total_bytes_used

568

# We are using less than 50% of the content. Is the content we are

569

# using at the beginning of the block? If so, we can just trim the

570

# tail, rather than rebuilding from scratch.

562

571

if total_bytes_used * 2 > last_byte_used:

563

self._trim_block(last_byte_used)

564

return

572

return 'trim', last_byte_used, total_bytes_used

565

573

566

574

# We are using a small amount of the data, and it isn't just packed

567

575

# nicely at the front, so rebuild the content.

574

582

# expanding many deltas into fulltexts, as well.

575

583

# If we build a cheap enough 'strip', then we could try a strip,

576

584

# if that expands the content, we then rebuild.

577

self._rebuild_block()

585

return 'rebuild', last_byte_used, total_bytes_used

586

587

def check_is_well_utilized(self):

588

"""Is the current block considered 'well utilized'?

589

590

This heuristic asks if the current block considers itself to be a fully

591

developed group, rather than just a loose collection of data.

592

"""

593

if len(self._factories) == 1:

594

# A block of length 1 could be improved by combining with other

595

# groups - don't look deeper. Even larger than max size groups

596

# could compress well with adjacent versions of the same thing.

597

return False

598

action, last_byte_used, total_bytes_used = self._check_rebuild_action()

599

block_size = self._block._content_length

600

if total_bytes_used < block_size * self._max_cut_fraction:

601

# This block wants to trim itself small enough that we want to

602

# consider it under-utilized.

603

return False

604

# TODO: This code is meant to be the twin of _insert_record_stream's

605

# 'start_new_block' logic. It would probably be better to factor

606

# out that logic into a shared location, so that it stays

607

# together better

608

# We currently assume a block is properly utilized whenever it is >75%

609

# of the size of a 'full' block. In normal operation, a block is

610

# considered full when it hits 4MB of same-file content. So any block

611

# >3MB is 'full enough'.

612

# The only time this isn't true is when a given block has large-object

613

# content. (a single file >4MB, etc.)

614

# Under these circumstances, we allow a block to grow to

615

# 2 x largest_content. Which means that if a given block had a large

616

# object, it may actually be under-utilized. However, given that this

617

# is 'pack-on-the-fly' it is probably reasonable to not repack large

618

# content blobs on-the-fly. Note that because we return False for all

619

# 1-item blobs, we will repack them; we may wish to reevaluate our

620

# treatment of large object blobs in the future.

621

if block_size >= self._full_enough_block_size:

622

return True

623

# If a block is <3MB, it still may be considered 'full' if it contains

624

# mixed content. The current rule is 2MB of mixed content is considered

625

# full. So check to see if this block contains mixed content, and

626

# set the threshold appropriately.

627

common_prefix = None

628

for factory in self._factories:

629

prefix = factory.key[:-1]

630

if common_prefix is None:

631

common_prefix = prefix

632

elif prefix != common_prefix:

633

# Mixed content, check the size appropriately

634

if block_size >= self._full_enough_mixed_block_size:

635

return True

636

break

637

# The content failed both the mixed check and the single-content check

638

# so obviously it is not fully utilized

639

# TODO: there is one other constraint that isn't being checked

640

# namely, that the entries in the block are in the appropriate

641

# order. For example, you could insert the entries in exactly

642

# reverse groupcompress order, and we would think that is ok.

643

# (all the right objects are in one group, and it is fully

644

# utilized, etc.) For now, we assume that case is rare,

645

# especially since we should always fetch in 'groupcompress'

646

# order.

647

return False

648

649

def _check_rebuild_block(self):

650

action, last_byte_used, total_bytes_used = self._check_rebuild_action()

651

if action is None:

652

return

653

if action == 'trim':

654

self._trim_block(last_byte_used)

655

elif action == 'rebuild':

656

self._rebuild_block()

657

else:

658

raise ValueError('unknown rebuild action: %r' % (action,))

578

659

579

660

def _wire_bytes(self):

580

661

"""Return a byte stream suitable for transmitting over the wire."""

1087

1168

class GroupCompressVersionedFiles(VersionedFiles):

1088

1169

"""A group-compress based VersionedFiles implementation."""

1089

1170

1090

def __init__(self, index, access, delta=True):

1171

def __init__(self, index, access, delta=True, _unadded_refs=None):

1091

1172

"""Create a GroupCompressVersionedFiles object.

1092

1173

1093

1174

:param index: The index object storing access and graph data.

1094

1175

:param access: The access object storing raw data.

1095

1176

:param delta: Whether to delta compress or just entropy compress.

1177

:param _unadded_refs: private parameter, don't use.

1096

1178

"""

1097

1179

self._index = index

1098

1180

self._access = access

1099

1181

self._delta = delta

1100

self._unadded_refs = {}

1182

if _unadded_refs is None:

1183

_unadded_refs = {}

1184

self._unadded_refs = _unadded_refs

1101

1185

self._group_cache = LRUSizeCache(max_size=50*1024*1024)

1102

1186

self._fallback_vfs = []

1103

1187

1188

def without_fallbacks(self):

1189

"""Return a clone of this object without any fallbacks configured."""

1190

return GroupCompressVersionedFiles(self._index, self._access,

1191

self._delta, _unadded_refs=dict(self._unadded_refs))

1192

1104

1193

def add_lines(self, key, parents, lines, parent_texts=None,

1105

1194

left_matching_blocks=None, nostore_sha=None, random_id=False,

1106

1195

check_content=True):

1570

1659

block_length = None

1571

1660

# XXX: TODO: remove this, it is just for safety checking for now

1572

1661

inserted_keys = set()

1662

reuse_this_block = reuse_blocks

1573

1663

for record in stream:

1574

1664

# Raise an error when a record is missing.

1575

1665

if record.storage_kind == 'absent':

1583

1673

if reuse_blocks:

1584

1674

# If the reuse_blocks flag is set, check to see if we can just

1585

1675

# copy a groupcompress block as-is.

1676

# We only check on the first record (groupcompress-block) not

1677

# on all of the (groupcompress-block-ref) entries.

1678

# The reuse_this_block flag is then kept for as long as

1679

if record.storage_kind == 'groupcompress-block':

1680

# Check to see if we really want to re-use this block

1681

insert_manager = record._manager

1682

reuse_this_block = insert_manager.check_is_well_utilized()

1683

else:

1684

reuse_this_block = False

1685

if reuse_this_block:

1686

# We still want to reuse this block

1586

1687

if record.storage_kind == 'groupcompress-block':

1587

1688

# Insert the raw block into the target repo

1588

1689

insert_manager = record._manager

1589

insert_manager._check_rebuild_block()

1590

1690

bytes = record._manager._block.to_bytes()

1591

1691

_, start, length = self._access.add_raw_records(

1592

1692

[(None, len(bytes))], bytes)[0]

1597

1697

'groupcompress-block-ref'):

1598

1698

if insert_manager is None:

1599

1699

raise AssertionError('No insert_manager set')

1700

if insert_manager is not record._manager:

1701

raise AssertionError('insert_manager does not match'

1702

' the current record, we cannot be positive'

1703

' that the appropriate content was inserted.'

1704

)

1600

1705

value = "%d %d %d %d" % (block_start, block_length,

1601

1706

record._start, record._end)

1602

1707

nodes = [(record.key, value, (record.parents,))]

1714

1819

1715

1820

def __init__(self, graph_index, is_locked, parents=True,

1716

1821

add_callback=None, track_external_parent_refs=False,

1717

inconsistency_fatal=True):

1822

inconsistency_fatal=True, track_new_keys=False):

1718

1823

"""Construct a _GCGraphIndex on a graph_index.

1719

1824

1720

1825

:param graph_index: An implementation of bzrlib.index.GraphIndex.

1740

1845

self._is_locked = is_locked

1741

1846

self._inconsistency_fatal = inconsistency_fatal

1742

1847

if track_external_parent_refs:

1743

self._key_dependencies = knit._KeyRefs()

1848

self._key_dependencies = knit._KeyRefs(

1849

track_new_keys=track_new_keys)

1744

1850

else:

1745

1851

self._key_dependencies = None

1746

1852

1800

1906

result.append((key, value))

1801

1907

records = result

1802

1908

key_dependencies = self._key_dependencies

1803

if key_dependencies is not None and self._parents:

1804

for key, value, refs in records:

1805

parents = refs[0]

1806

key_dependencies.add_references(key, parents)

1909

if key_dependencies is not None:

1910

if self._parents:

1911

for key, value, refs in records:

1912

parents = refs[0]

1913

key_dependencies.add_references(key, parents)

1914

else:

1915

for key, value, refs in records:

1916

new_keys.add_key(key)

1807

1917

self._add_callback(records)

1808

1918

1809

1919

def _check_read(self):

1866

1976

"""Return the keys of missing parents."""

1867

1977

# Copied from _KnitGraphIndex.get_missing_parents

1868

1978

# We may have false positives, so filter those out.

1869

self._key_dependencies.add_keys(

1979

self._key_dependencies.satisfy_refs_for_keys(

1870

1980

self.get_parent_map(self._key_dependencies.get_unsatisfied_refs()))

1871

1981

return frozenset(self._key_dependencies.get_unsatisfied_refs())

1872

1982

1926

2036

1927

2037

This allows this _GCGraphIndex to keep track of any missing

1928

2038

compression parents we may want to have filled in to make those

1929

indices valid.

2039

indices valid. It also allows _GCGraphIndex to track any new keys.

1930

2040

1931

2041

:param graph_index: A GraphIndex

1932

2042

"""

1933

if self._key_dependencies is not None:

1934

# Add parent refs from graph_index (and discard parent refs that

1935

# the graph_index has).

1936

add_refs = self._key_dependencies.add_references

1937

for node in graph_index.iter_all_entries():

1938

add_refs(node[1], node[3][0])

1939

2043

key_dependencies = self._key_dependencies

2044

if key_dependencies is None:

2045

return

2046

for node in graph_index.iter_all_entries():

2047

# Add parent refs from graph_index (and discard parent refs

2048

# that the graph_index has).

2049

key_dependencies.add_references(node[1], node[3][0])

1940

2050

1941

2051

1942

2052

from bzrlib._groupcompress_py import (

1956

2066

decode_base128_int,

1957

2067

)

1958

2068

GroupCompressor = PyrexGroupCompressor

1959

except ImportError:

2069

except ImportError, e:

2070

osutils.failed_to_load_extension(e)

1960

2071

GroupCompressor = PythonGroupCompressor

1961

2072

Older »