~bzr-pqm/bzr/bzr.dev

Committer: John Arbash Meinel
Date: 2007-02-09 23:39:24 UTC
mto: This revision was merged to the branch mainline in revision 2294.
Revision ID: john@arbash-meinel.com-20070209233924-k7qbjpta67k3ry2h

Audit Branch to ensure utf8 revision ids.
Requires a small update to lockable_files to allow us to directly
write byte strings, without needing to wrap in a StringIO

files added:
BRANCH.TODO

COPYING.txt

HACKING

INSTALL

Makefile

NEWS.developers

bzr.ico

bzrlib/annotate.py

bzrlib/benchmarks

bzrlib/benchmarks/__init__.py

bzrlib/benchmarks/bench_add.py

bzrlib/benchmarks/bench_bench.py

bzrlib/benchmarks/bench_bundle.py

bzrlib/benchmarks/bench_cache_utf8.py

bzrlib/benchmarks/bench_checkout.py

bzrlib/benchmarks/bench_commit.py

bzrlib/benchmarks/bench_info.py

bzrlib/benchmarks/bench_inventory.py

bzrlib/benchmarks/bench_log.py

bzrlib/benchmarks/bench_osutils.py

bzrlib/benchmarks/bench_rocks.py

bzrlib/benchmarks/bench_sftp.py

bzrlib/benchmarks/bench_startup.py

bzrlib/benchmarks/bench_status.py

bzrlib/benchmarks/bench_transform.py

bzrlib/benchmarks/bench_workingtree.py

bzrlib/benchmarks/bench_xml.py

bzrlib/benchmarks/tree_creator

bzrlib/benchmarks/tree_creator/__init__.py

bzrlib/benchmarks/tree_creator/heavily_merged.py

bzrlib/benchmarks/tree_creator/kernel_like.py

bzrlib/benchmarks/tree_creator/many_commit.py

bzrlib/benchmarks/tree_creator/simple_many_commit.py

bzrlib/builtins.py

bzrlib/bundle

bzrlib/bundle/apply_bundle.py

bzrlib/bundle/bundle_data.py

bzrlib/bundle/commands.py

bzrlib/bundle/common.py

bzrlib/bundle/old

bzrlib/bundle/old/send_changeset.py

bzrlib/bundle/serializer

bzrlib/bundle/serializer/__init__.py

bzrlib/bundle/serializer/v08.py

bzrlib/bundle/serializer/v09.py

bzrlib/bzrdir.py

bzrlib/cache_utf8.py

bzrlib/cmd_version_info.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/debug.py

bzrlib/decorators.py

bzrlib/delta.py

bzrlib/doc

bzrlib/doc/__init__.py

bzrlib/doc/api

bzrlib/doc/api/__init__.py

bzrlib/doc/api/branch.txt

bzrlib/doc/api/transport.txt

bzrlib/export

bzrlib/export/__init__.py

bzrlib/export/dir_exporter.py

bzrlib/export/tar_exporter.py

bzrlib/export/zip_exporter.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/generate_ids.py

bzrlib/globbing.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/hashcache.py

bzrlib/help_topics.py

bzrlib/identitymap.py

bzrlib/ignores.py

bzrlib/inspect_for_copy.py

bzrlib/inter.py

bzrlib/intset.py

bzrlib/iterablefile.py

bzrlib/knit.py

bzrlib/lazy_import.py

bzrlib/lazy_regex.py

bzrlib/lock.py

bzrlib/lockable_files.py

bzrlib/lockdir.py

bzrlib/lsprof.py

bzrlib/memorytree.py

bzrlib/merge3.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/mutabletree.py

bzrlib/option.py

bzrlib/patches.py

bzrlib/patiencediff.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/plugins/launchpad

bzrlib/plugins/launchpad/__init__.py

bzrlib/plugins/launchpad/lp_registration.py

bzrlib/plugins/launchpad/test_register.py

bzrlib/progress.py

bzrlib/reconcile.py

bzrlib/registry.py

bzrlib/repofmt

bzrlib/repository.py

bzrlib/revisionspec.py

bzrlib/revisiontree.py

bzrlib/rio.py

bzrlib/shellcomplete.py

bzrlib/sign_my_commits.py

bzrlib/store

bzrlib/store/revision

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/store/text.py

bzrlib/store/versioned

bzrlib/store/versioned/__init__.py

bzrlib/symbol_versioning.py

bzrlib/testament.py

bzrlib/tests

bzrlib/tests/EncodingAdapter.py

bzrlib/tests/HTTPTestUtil.py

bzrlib/tests/HttpServer.py

bzrlib/tests/TestUtil.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_add.py

bzrlib/tests/blackbox/test_added.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_ancestry.py

bzrlib/tests/blackbox/test_annotate.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_branch.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_bundle.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_command_encoding.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_debug.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_exceptions.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_find_merge_base.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_ignore.py

bzrlib/tests/blackbox/test_ignored.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_inventory.py

bzrlib/tests/blackbox/test_locale.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_ls.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_mv.py

bzrlib/tests/blackbox/test_nick.py

bzrlib/tests/blackbox/test_non_ascii.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_reconcile.py

bzrlib/tests/blackbox/test_remerge.py

bzrlib/tests/blackbox/test_remove.py

bzrlib/tests/blackbox/test_remove_tree.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_revision_history.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_serve.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_testament.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/blackbox/test_version_info.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/blackbox/test_whoami.py

bzrlib/tests/branch_implementations

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_bound_sftp.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/branch_implementations/test_break_lock.py

bzrlib/tests/branch_implementations/test_hooks.py

bzrlib/tests/branch_implementations/test_http.py

bzrlib/tests/branch_implementations/test_locking.py

bzrlib/tests/branch_implementations/test_parent.py

bzrlib/tests/branch_implementations/test_permissions.py

bzrlib/tests/branch_implementations/test_pull.py

bzrlib/tests/branch_implementations/test_update.py

bzrlib/tests/bzrdir_implementations

bzrlib/tests/bzrdir_implementations/__init__.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/interrepository_implementations

bzrlib/tests/interrepository_implementations/__init__.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/intertree_implementations

bzrlib/tests/intertree_implementations/__init__.py

bzrlib/tests/intertree_implementations/test_compare.py

bzrlib/tests/interversionedfile_implementations

bzrlib/tests/interversionedfile_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/test_join.py

bzrlib/tests/lock_helpers.py

bzrlib/tests/repository_implementations

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_break_lock.py

bzrlib/tests/repository_implementations/test_commit_builder.py

bzrlib/tests/repository_implementations/test_fileid_involved.py

bzrlib/tests/repository_implementations/test_reconcile.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/repository_implementations/test_revision.py

bzrlib/tests/revisionstore_implementations

bzrlib/tests/revisionstore_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_annotate.py

bzrlib/tests/test_api.py

bzrlib/tests/test_atomicfile.py

bzrlib/tests/test_bad_files.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_cache_utf8.py

bzrlib/tests/test_commands.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_commit_merge.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_decorators.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_doc_generate.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_escaped_store.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_ftp_transport.py

bzrlib/tests/test_generate_docs.py

bzrlib/tests/test_generate_ids.py

bzrlib/tests/test_globbing.py

bzrlib/tests/test_gpg.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_hashcache.py

bzrlib/tests/test_http.py

bzrlib/tests/test_http_response.py

bzrlib/tests/test_identitymap.py

bzrlib/tests/test_ignores.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lazy_import.py

bzrlib/tests/test_lazy_regex.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_lockdir.py

bzrlib/tests/test_log.py

bzrlib/tests/test_memorytree.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge3.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_missing.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_nonascii.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_osutils_encodings.py

bzrlib/tests/test_patch.py

bzrlib/tests/test_patches.py

bzrlib/tests/test_patches_data

bzrlib/tests/test_patches_data/diff

bzrlib/tests/test_patches_data/diff-2

bzrlib/tests/test_patches_data/diff-3

bzrlib/tests/test_patches_data/diff-4

bzrlib/tests/test_patches_data/diff-5

bzrlib/tests/test_patches_data/diff-6

bzrlib/tests/test_patches_data/insert_top.patch

bzrlib/tests/test_patches_data/mod

bzrlib/tests/test_patches_data/mod-2

bzrlib/tests/test_patches_data/mod-3

bzrlib/tests/test_patches_data/mod-4

bzrlib/tests/test_patches_data/mod-5

bzrlib/tests/test_patches_data/mod-6

bzrlib/tests/test_patches_data/orig

bzrlib/tests/test_patches_data/orig-2

bzrlib/tests/test_patches_data/orig-3

bzrlib/tests/test_patches_data/orig-4

bzrlib/tests/test_patches_data/orig-5

bzrlib/tests/test_patches_data/orig-6

bzrlib/tests/test_patches_data/patchtext.patch

bzrlib/tests/test_permissions.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_progress.py

bzrlib/tests/test_read_bundle.py

bzrlib/tests/test_reconcile.py

bzrlib/tests/test_registry.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revert.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_revisionnamespaces.py

bzrlib/tests/test_revisiontree.py

bzrlib/tests/test_rio.py

bzrlib/tests/test_sampler.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_smart_transport.py

bzrlib/tests/test_source.py

bzrlib/tests/test_status.py

bzrlib/tests/test_store.py

bzrlib/tests/test_symbol_versioning.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_textfile.py

bzrlib/tests/test_textmerge.py

bzrlib/tests/test_trace.py

bzrlib/tests/test_transactions.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_tree.py

bzrlib/tests/test_treebuilder.py

bzrlib/tests/test_tsort.py

bzrlib/tests/test_tuned_gzip.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_urlutils.py

bzrlib/tests/test_version.py

bzrlib/tests/test_version_info.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/test_weave.py

bzrlib/tests/test_whitebox.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_wsgi.py

bzrlib/tests/test_xml.py

bzrlib/tests/tree_implementations

bzrlib/tests/tree_implementations/__init__.py

bzrlib/tests/tree_implementations/test_test_trees.py

bzrlib/tests/tree_implementations/test_tree.py

bzrlib/tests/treeshape.py

bzrlib/tests/workingtree_implementations

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_basis_inventory.py

bzrlib/tests/workingtree_implementations/test_break_lock.py

bzrlib/tests/workingtree_implementations/test_changes_from.py

bzrlib/tests/workingtree_implementations/test_commit.py

bzrlib/tests/workingtree_implementations/test_executable.py

bzrlib/tests/workingtree_implementations/test_flush.py

bzrlib/tests/workingtree_implementations/test_get_parent_ids.py

bzrlib/tests/workingtree_implementations/test_is_control_filename.py

bzrlib/tests/workingtree_implementations/test_is_ignored.py

bzrlib/tests/workingtree_implementations/test_locking.py

bzrlib/tests/workingtree_implementations/test_merge_from_branch.py

bzrlib/tests/workingtree_implementations/test_mkdir.py

bzrlib/tests/workingtree_implementations/test_parents.py

bzrlib/tests/workingtree_implementations/test_pull.py

bzrlib/tests/workingtree_implementations/test_put_file.py

bzrlib/tests/workingtree_implementations/test_read_working_inventory.py

bzrlib/tests/workingtree_implementations/test_readonly.py

bzrlib/tests/workingtree_implementations/test_set_root_id.py

bzrlib/tests/workingtree_implementations/test_unversion.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/textfile.py

bzrlib/textmerge.py

bzrlib/transactions.py

bzrlib/transform.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/chroot.py

bzrlib/transport/decorator.py

bzrlib/transport/fakenfs.py

bzrlib/transport/fakevfat.py

bzrlib/transport/ftp.py

bzrlib/transport/http

bzrlib/transport/http/__init__.py

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_pycurl_errors.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/http/_urllib2_wrappers.py

bzrlib/transport/http/response.py

bzrlib/transport/http/wsgi.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/readonly.py

bzrlib/transport/sftp.py

bzrlib/transport/smart.py

bzrlib/transport/ssh.py

bzrlib/treebuilder.py

bzrlib/tsort.py

bzrlib/tuned_gzip.py

bzrlib/ui

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/uncommit.py

bzrlib/upgrade.py

bzrlib/urlutils.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/version.py

bzrlib/version_info_formats

bzrlib/version_info_formats/__init__.py

bzrlib/version_info_formats/format_python.py

bzrlib/version_info_formats/format_rio.py

bzrlib/versionedfile.py

bzrlib/weave.py

bzrlib/weave_commands.py

bzrlib/weavefile.py

bzrlib/win32console.py

bzrlib/xml4.py

bzrlib/xml5.py

bzrlib/xml6.py

contrib/bash/bzr

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

doc/README.1st

doc/bazaar-vcs.org.kid

doc/centralized_workflow.txt

doc/configuration.txt

doc/default.css

doc/http_smart_server.txt

doc/index.txt

doc/plugins.txt

doc/server.txt

doc/setting_up_email.txt

doc/specifying_revisions.txt

doc/tutorial.txt

doc/using_aliases.txt

doc/version_info.txt

generate_docs.py

profile_imports.py

tools

tools/__init__.py

tools/biobench.py

tools/capture_tree.py

tools/convertfile.py

tools/convertinv.py

tools/doc_generate

tools/doc_generate/__init__.py

tools/doc_generate/autodoc_bash_completion.py

tools/doc_generate/autodoc_man.py

tools/doc_generate/autodoc_rstx.py

tools/history2revfiles.py

tools/http_client.py

tools/riodemo.py

tools/rst2html.py

tools/rst2prettyhtml.py

tools/trace-revisions

tools/weavebench.py

tools/weavemerge.sh

tools/win32

tools/win32/__init__.py

tools/win32/bazaar.url

tools/win32/bzr-win32-bdist-postinstall.py

tools/win32/bzr.iss.cog

tools/win32/bzr_postinstall.py

tools/win32/file_version.py

tools/win32/info.txt

tools/win32/ostools.py

tools/win32/start_bzr.bat

files removed:
bzrlib/mdiff.py

bzrlib/merge_core.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/statcache.py

doc/Makefile

doc/adoption.txt

doc/bitkeeper.txt

doc/changelogs.txt

doc/cherry-picking.txt

doc/cmdref.txt

doc/common-format.txt

doc/compared-aegis.txt

doc/compared-codeville.txt

doc/compared-cvsnt.txt

doc/compared-opencm.txt

doc/compared-prcs.txt

doc/compared-teamware.txt

doc/compression.txt

doc/config-specs.txt

doc/conflicts.txt

doc/costs.txt

doc/darcs.txt

doc/deadly-sins.txt

doc/default.css

doc/design.txt

doc/extra-commands.txt

doc/formats.txt

doc/hashes.txt

doc/ignore.txt

doc/index.txt

doc/interrupted.txt

doc/intro.txt

doc/inventory.txt

doc/join-branches.txt

doc/kill-version.txt

doc/layers.txt

doc/library-interface.txt

doc/merge.txt

doc/mirroring.txt

doc/monotone.txt

doc/news.txt

doc/optional-edit.txt

doc/partial-commit.txt

doc/pool.txt

doc/purpose.txt

doc/python.txt

doc/quilt.txt

doc/quotes.txt

doc/random.txt

doc/requirements.txt

doc/revfile-annotation.txt

doc/revfile.txt

doc/revision-syntax.txt

doc/rollup.txt

doc/scalability.txt

doc/security.txt

doc/shared-branches.txt

doc/short-demo.txt

doc/supportability.txt

doc/svk.txt

doc/switch-in-branch.txt

doc/tagging.txt

doc/taxonomy.txt

doc/thanks.txt

doc/todo-from-arch.txt

doc/unchanged.txt

doc/unrelated-merge.txt

doc/usability.txt

doc/use-cases.txt

doc/web-interface.txt

doc/workflow.txt

doc/yaml.txt

notes

notes/new-inventory-sample.xml

notes/performance.txt

patches

patches/symlink-support.patch

testbzr

urlgrabber

urlgrabber/__init__.py

urlgrabber/byterange.py

urlgrabber/grabber.py

urlgrabber/keepalive.py

urlgrabber/mirror.py

urlgrabber/progress.py

files renamed:
bzrlib/changeset.py => bzrlib/bundle/__init__.py

bzrlib/store.py => bzrlib/store/__init__.py

elementtree/ => bzrlib/util/elementtree/

bzrlib/xml.py => bzrlib/xml_serializer.py

contrib/bash/bzr => contrib/bash/bzr.simple

bzrlib/newinventory.py => contrib/newinventory.py

files modified:
.bzrignore

.rsyncexclude

NEWS

README

TODO

build-api

bzr *

bzrlib/__init__.py

bzrlib/add.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/help.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/log.py

bzrlib/merge.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/revision.py

bzrlib/status.py

bzrlib/textinv.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/util/elementtree/ElementTree.py

bzrlib/workingtree.py

contrib/add-bzr-to-baz

contrib/zsh/_bzr

setup.py

Show diffs side-by-side

added added

removed removed

bzrlib/knit.py

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only

updates.

Knit file layout:

lifeless: the data file is made up of "delta records". each delta record has a delta header

that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of

the -expanded data- (ie, the delta applied to the parent). the delta also ends with an

end-marker; simply "end VERSION"

delta can be line or full contents.

... the 8's there are the index number of the annotation.

version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e

59,59,3

8 if ie.executable:

8 e.set('executable', 'yes')

130,130,2

8 if elt.get('executable') == 'yes':

8 ie.executable = True

end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

what's in an index:

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents

09:33 < jrydberg> lifeless: the parents are currently dictionary compressed

09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)

09:33 < lifeless> right

09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used

in the deltas to provide line annotation

"""

# TODOS:

# 10:16 < lifeless> make partial index writes safe

# 10:16 < lifeless> implement 'knit.check()' like weave.check()

# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave

# always' approach.

# move sha1 out of the content so that join is faster at verifying parents

# record content length ?

from copy import copy

from cStringIO import StringIO

import difflib

from itertools import izip, chain

import operator

import os

import sys

import warnings

import bzrlib

from bzrlib import (

cache_utf8,

errors,

patiencediff,

progress,

ui,

)

from bzrlib.errors import (

FileExists,

NoSuchFile,

KnitError,

InvalidRevisionId,

KnitCorrupt,

KnitHeaderError,

RevisionNotPresent,

RevisionAlreadyPresent,

)

from bzrlib.tuned_gzip import GzipFile

from bzrlib.trace import mutter

from bzrlib.osutils import (

contains_whitespace,

contains_linebreaks,

sha_strings,

)

from bzrlib.symbol_versioning import DEPRECATED_PARAMETER, deprecated_passed

from bzrlib.tsort import topo_sort

import bzrlib.ui

100

import bzrlib.weave

101

from bzrlib.versionedfile import VersionedFile, InterVersionedFile

102

103

104

# TODO: Split out code specific to this format into an associated object.

105

106

# TODO: Can we put in some kind of value to check that the index and data

107

# files belong together?

108

109

# TODO: accommodate binaries, perhaps by storing a byte count

110

111

# TODO: function to check whole file

112

113

# TODO: atomically append data, then measure backwards from the cursor

114

# position after writing to work out where it was located. we may need to

115

# bypass python file buffering.

116

117

DATA_SUFFIX = '.knit'

118

INDEX_SUFFIX = '.kndx'

119

120

121

class KnitContent(object):
    """The annotated lines of one knit version, against which deltas are made.

    The content is kept in ``self._lines`` as a sequence of (origin, text)
    pairs, one pair per line.
    """

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Return an iterator over the (origin, text) pair of every line."""
        return iter(self._lines)

    def annotate(self):
        """Return the (origin, text) pairs as a freshly built list."""
        return list(self.annotate_iter())

    def line_delta_iter(self, new_lines):
        """Yield the hunks that turn this content into new_lines.

        new_lines is another content object.  Each hunk is a tuple of
        (old_start, old_end, new_length, new_annotated_lines); ranges the
        matcher reports as 'equal' produce no hunk.
        """
        target_texts = new_lines.text()
        source_texts = self.text()
        matcher = KnitSequenceMatcher(None, source_texts, target_texts)
        for opcode in matcher.get_opcodes():
            tag, old_lo, old_hi, new_lo, new_hi = opcode
            if tag != 'equal':
                # ofrom, oto, length, data
                yield (old_lo, old_hi, new_hi - new_lo,
                       new_lines._lines[new_lo:new_hi])

    def line_delta(self, new_lines):
        """Return the hunks of line_delta_iter(new_lines) as a list."""
        return list(self.line_delta_iter(new_lines))

    def text(self):
        """Return the text of every line, dropping the origins."""
        texts = []
        for _origin, text in self._lines:
            texts.append(text)
        return texts

    def copy(self):
        """Return a new KnitContent built on a shallow copy of the lines."""
        duplicate_lines = self._lines[:]
        return KnitContent(duplicate_lines)

154

155

156

class _KnitFactory(object):
    """Shared base of the factories that build content objects."""

    def make(self, lines, version):
        """Return a KnitContent in which every line has origin ``version``.

        :param lines: sized sequence of text lines.
        :param version: the origin paired with each of the lines.
        """
        origins = [version] * len(lines)
        return KnitContent(zip(origins, lines))

162

163

164

class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects.

    In the annotated serialisation every content line is stored as the utf8
    revision id of its origin, a single space, and then the line text.
    """

    annotated = True

    def parse_fulltext(self, content, version):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\n
        internal representation is of the format:
        (revid, plaintext)

        :param content: iterable of serialised lines.
        :param version: not used here; each line carries its own origin.
        :return: a KnitContent wrapping the split lines.
        """
        lines = [line.split(' ', 1) for line in content]
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines, version=None):
        """Return an iterator over the hunks of a serialised line delta.

        :param lines: the serialised delta, see parse_line_delta.
        :param version: optional; accepted so that this method can be called
            the same way as KnitPlainFactory.parse_line_delta_iter.  It is
            forwarded to parse_line_delta, which does not use it.
        """
        # parse_line_delta requires a version argument; calling it with only
        # the lines raised TypeError.
        return iter(self.parse_line_delta(lines, version))

    def parse_line_delta(self, lines, version):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart,intend,intcount
        1..count lines:
        revid(utf8) newline\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])

        :param lines: iterable of serialised delta lines.
        :param version: not used here; each line carries its own origin.
        :return: a list with one entry per hunk.
        """
        result = []
        lines = iter(lines)
        next_line = lines.next

        # walk through the lines parsing.  The header loop and next_line()
        # share one iterator, so the content lines consumed for a hunk are
        # never mistaken for headers.
        for header in lines:
            start, end, count = [int(n) for n in header.split(',')]
            contents = [next_line().split(' ', 1) for i in xrange(count)]
            result.append((start, end, count, contents))
        return result

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return (line.split(' ', 1)[1] for line in lines)

    def get_linedelta_content(self, lines):
        """Extract just the content from a line delta.

        This doesn't return all of the extra information stored in a delta.
        Only the actual content lines.
        """
        lines = iter(lines)
        next_line = lines.next
        for header in lines:
            header = header.split(',')
            # The third header field is the number of content lines that
            # follow this header.
            count = int(header[2])
            for i in xrange(count):
                origin, text = next_line().split(' ', 1)
                yield text

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        encode_utf8 = cache_utf8.encode
        return ['%s %s' % (encode_utf8(o), t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        encode_utf8 = cache_utf8.encode
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend(encode_utf8(origin) + ' ' + text
                       for origin, text in lines)
        return out

243

244

245

class KnitPlainFactory(_KnitFactory):
    """Factory producing Content objects from unannotated knit data."""

    annotated = False

    def parse_fulltext(self, content, version):
        """Parse an unannotated fulltext.

        This is not a no-op: the internal representation still holds
        (versionid, line) pairs, all of them carrying the same versionid.
        """
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        """Yield (start, end, count, annotated_lines) for each delta hunk.

        :param lines: sized, sliceable sequence of serialised delta lines;
            each hunk is a 'start,end,count' header followed by count
            content lines.
        :param version: the origin paired with every content line.
        """
        position = 0
        total = len(lines)
        while position < total:
            header = lines[position]
            body_start = position + 1
            start, end, count = [int(field) for field in header.split(',')]
            body_end = body_start + count
            yield (start, end, count,
                   zip([version] * count, lines[body_start:body_end]))
            position = body_end

    def parse_line_delta(self, lines, version):
        """Return the hunks of parse_line_delta_iter as a list."""
        return list(self.parse_line_delta_iter(lines, version))

    def get_fulltext_content(self, lines):
        """Return an iterator over the content lines of a fulltext."""
        return iter(lines)

    def get_linedelta_content(self, lines):
        """Yield only the content lines held in a line delta.

        The start/end/count information of the headers is not returned.
        """
        line_iter = iter(lines)
        take = line_iter.next
        for header in line_iter:
            # Third header field: how many content lines follow.
            count = int(header.split(',')[2])
            for _ in xrange(count):
                yield take()

    def lower_fulltext(self, content):
        """Return the serialisable form of a fulltext: its plain texts."""
        return content.text()

    def lower_line_delta(self, delta):
        """Return the serialisable lines of a delta.

        Each hunk becomes a 'start,end,count' header line followed by the
        text of its lines; the origins are dropped.
        """
        serialized = []
        for start, end, count, hunk_lines in delta:
            serialized.append('%d,%d,%d\n' % (start, end, count))
            for _origin, text in hunk_lines:
                serialized.append(text)
        return serialized

298

299

300

def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location.

    :param transport: transport through which the knit files are accessed.
    :param relpath: path of the knit relative to transport; the index and
        data suffixes are appended by KnitVersionedFile.
    """
    # KnitVersionedFile.__init__ takes (relpath, transport, ...) with the
    # access mode and factory further along, so they are passed by keyword.
    # The factory is stored and used as an instance, and create=True is
    # needed because without it only an existing knit is opened.
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory(), create=True)
    # NOTE(review): _KnitData._open_file is not visible from here; the call
    # is kept from the original - confirm it still exists.
    k._data._open_file()

304

305

306

class KnitVersionedFile(VersionedFile):

307

"""Weave-like structure with faster random access.

308

309

A knit stores a number of texts and a summary of the relationships

310

between them. Texts are identified by a string version-id. Texts

311

are normally stored and retrieved as a series of lines, but can

312

also be passed as single strings.

313

314

Lines are stored with the trailing newline (if any) included, to

315

avoid special cases for files with no final newline. Lines are

316

composed of 8-bit characters, not unicode. The combination of

317

these approaches should mean any 'binary' file can be safely

318

stored and retrieved.

319

"""

320

321

def __init__(self, relpath, transport, file_mode=None, access_mode=None,
             factory=None, basis_knit=DEPRECATED_PARAMETER, delta=True,
             create=False, create_parent_dir=False, delay_create=False,
             dir_mode=None):
    """Construct a knit at location specified by relpath.

    :param relpath: path of the knit relative to transport; INDEX_SUFFIX
        and DATA_SUFFIX are appended to it for the two underlying files.
    :param transport: transport handed to the index and data objects.
    :param file_mode: passed through to the index and data objects.
    :param access_mode: 'r' or 'w'; None is treated as 'w'.
    :param factory: content factory instance; a KnitAnnotateFactory is
        created when none is given.
    :param basis_knit: deprecated; passing it only emits a
        DeprecationWarning.
    :param delta: stored as self.delta.
    :param create: If not True, only open an existing knit.
    :param create_parent_dir: If True, create the parent directory if
        creating the file fails. (This is used for stores with
        hash-prefixes that may not exist yet)
    :param delay_create: The calling code is aware that the knit won't
        actually be created until the first data is stored.
    :param dir_mode: passed through to the index and data objects.
    """
    if deprecated_passed(basis_knit):
        warnings.warn("KnitVersionedFile.__init__(): The basis_knit"
                      " parameter is deprecated as of bzr 0.9.",
                      DeprecationWarning, stacklevel=2)
    if access_mode is None:
        access_mode = 'w'
    super(KnitVersionedFile, self).__init__(access_mode)
    assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
    self.transport = transport
    self.filename = relpath
    self.factory = factory or KnitAnnotateFactory()
    self.writable = (access_mode == 'w')
    self.delta = delta

    # Upper bound on how many parents _check_should_delta walks back.
    self._max_delta_chain = 200

    self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
        access_mode, create=create, file_mode=file_mode,
        create_parent_dir=create_parent_dir, delay_create=delay_create,
        dir_mode=dir_mode)
    # len(self) is evaluated after the index has been set up; presumably it
    # reports the number of versions in the index, so the data file is only
    # created when the knit is still empty - TODO confirm.
    self._data = _KnitData(transport, relpath + DATA_SUFFIX,
        access_mode, create=create and not len(self), file_mode=file_mode,
        create_parent_dir=create_parent_dir, delay_create=delay_create,
        dir_mode=dir_mode)

358

359

def __repr__(self):
    """Return 'ClassName(absolute-location)' for this knit."""
    class_name = self.__class__.__name__
    location = self.transport.abspath(self.filename)
    return '%s(%s)' % (class_name, location)

362

363

def _check_should_delta(self, first_parents):
    """Decide between storing a new delta and storing a new fulltext.

    Starting from the first entry of first_parents, follow first parents
    back for at most _max_delta_chain steps, summing the stored size of
    every delta passed on the way.  The walk stops at the first version
    stored as a fulltext.

    :return: True if a delta should be created: a fulltext was reached and
        its stored size exceeds the summed size of the deltas leading to
        it.  False if a fulltext should be stored instead, which includes
        the case where no fulltext was reached within the limit.
    """
    accumulated_delta_size = 0
    ancestors = first_parents
    for _ in xrange(self._max_delta_chain):
        ancestor = ancestors[0]
        method = self._index.get_method(ancestor)
        pos, size = self._index.get_position(ancestor)
        if method == 'fulltext':
            return size > accumulated_delta_size
        accumulated_delta_size += size
        ancestors = self._index.get_parents(ancestor)
    # We couldn't find a fulltext, so we must create a new one
    return False

391

392

def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
    """See VersionedFile._add_delta().

    The delta is stored directly as a 'line-delta' record when
    _check_should_delta() approves of extending the chain from
    delta_parent. Otherwise (no delta_parent, or the chain check says a
    fulltext is wanted) the work is handed to the base class
    implementation.

    :param version_id: The id of the version being added.
    :param parents: The parents of the new version; all must be present.
    :param delta_parent: The version the delta applies to, or None.
    :param sha1: The sha1 of the resulting full text, stored with the record.
    :param noeol: True if the text lacks a final newline.
    :param delta: The line delta to store.
    """
    self._check_add(version_id, []) # should we check the lines ?
    self._check_versions_present(parents)

    if delta_parent is None:
        # reconstitute as full text: the delta may only be empty or a
        # single hunk anchored at the very start.
        assert len(delta) == 1 or len(delta) == 0
        if len(delta):
            assert delta[0][0] == 0
            assert delta[0][1] == 0, delta[0][1]
        return super(KnitVersionedFile, self)._add_delta(version_id,
                                                         parents,
                                                         delta_parent,
                                                         sha1,
                                                         noeol,
                                                         delta)

    # To speed the extract of texts the delta chain is limited to a fixed
    # number of deltas. This should minimize both I/O and the time spent
    # applying deltas. The window is a maximum of 200 deltas, combined with
    # a check that the total compressed size of the deltas is smaller than
    # the compressed size of the fulltext.
    if not self._check_should_delta([delta_parent]):
        # We don't want a delta here, just do a normal insertion.
        return super(KnitVersionedFile, self)._add_delta(version_id,
                                                         parents,
                                                         delta_parent,
                                                         sha1,
                                                         noeol,
                                                         delta)

    options = []
    if noeol:
        options.append('no-eol')
    options.append('line-delta')
    store_lines = self.factory.lower_line_delta(delta)

    where, size = self._data.add_record(version_id, sha1, store_lines)
    self._index.add_version(version_id, options, where, size, parents)

446

447

def _add_raw_records(self, records, data):
    """Add all the records 'records' with data pre-joined in 'data'.

    :param records: A list of tuples(version_id, options, parents, size).
    :param data: The data for the records. It is written in one call, and
        each record's index position is the write position plus the sum of
        the sizes of the records preceding it.
    """
    # write all the data in one go, then describe each slice in the index
    base = self._data.add_raw_record(data)
    consumed = 0
    entries = []
    for version_id, options, parents, size in records:
        end = consumed + size
        entries.append((version_id, options, base + consumed, size, parents))
        if self._data._do_cache:
            self._data._cache[version_id] = data[consumed:end]
        consumed = end
    self._index.add_versions(entries)

466

467

def enable_cache(self):
    """Start caching data for this knit.

    Delegates to enable_cache() on the knit's data component.
    """
    data_component = self._data
    data_component.enable_cache()

470

471

def clear_cache(self):
    """Clear the data cache only.

    Delegates to clear_cache() on the knit's data component; the index is
    not touched here.
    """
    data_component = self._data
    data_component.clear_cache()

474

475

def copy_to(self, name, transport):
    """See VersionedFile.copy_to().

    :param name: Base name (without suffix) for the copy on the target.
    :param transport: The transport to copy the knit to.
    """
    temp_index = name + INDEX_SUFFIX + '.tmp'
    # copy the current index to a temp index to avoid racing with local
    # writes
    index_file = self.transport.get(self._index._filename)
    try:
        transport.put_file_non_atomic(temp_index, index_file)
    finally:
        # the source handle must be released even if the upload fails
        index_file.close()
    # copy the data file
    data_file = self._data._open_file()
    try:
        transport.put_file(name + DATA_SUFFIX, data_file)
    finally:
        data_file.close()
    # move the copied index into place
    transport.move(temp_index, name + INDEX_SUFFIX)

489

490

def create_empty(self, name, transport, mode=None):
    """Return a new KnitVersionedFile created at name on transport.

    The new knit uses this knit's factory and delta setting. The mode
    argument is accepted but is not passed on by this implementation.
    """
    settings = dict(factory=self.factory, delta=self.delta, create=True)
    return KnitVersionedFile(name, transport, **settings)

493

494

def _fix_parents(self, version, new_parents):
    """Fix the parents list for version.

    This is done by appending a new version to the index with identical
    data except for the parents list. The new parents list must be a
    superset of the current list.
    """
    entry = self._index._cache[version]
    options, pos, size, old_parents = entry[1], entry[2], entry[3], entry[4]
    assert set(old_parents).difference(set(new_parents)) == set()
    self._index.add_version(version, options, pos, size, new_parents)

509

510

def get_delta(self, version_id):
    """Get a delta for constructing version from some other version.

    :return: A tuple (parent, sha1, noeol, delta). parent is the first
        parent of version_id, or None if it has no parents. For records
        stored as a fulltext the delta is computed against that parent (or
        against an empty text); for stored line-deltas the parsed stored
        delta is returned.
    :raises RevisionNotPresent: if version_id is not in this knit.
    """
    self.check_not_reserved_id(version_id)
    if not self.has_version(version_id):
        raise RevisionNotPresent(version_id, self.filename)

    parents = self.get_parents(version_id)
    parent = None
    if len(parents):
        parent = parents[0]
    data_pos, data_size = self._index.get_position(version_id)
    wanted = ((version_id, data_pos, data_size),)
    data, sha1 = self._data.read_records(wanted)[version_id]
    version_idx = self._index.lookup(version_id)
    noeol = 'no-eol' in self._index.get_options(version_id)
    if 'fulltext' != self._index.get_method(version_id):
        # stored as a delta already: just parse it
        delta = self.factory.parse_line_delta(data, version_idx)
        return parent, sha1, noeol, delta

    new_content = self.factory.parse_fulltext(data, version_idx)
    if parent is None:
        old_texts = []
    else:
        old_texts = self._get_content(parent).text()
    new_texts = new_content.text()
    delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
    return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)

538

539

def get_graph_with_ghosts(self):
    """See VersionedFile.get_graph_with_ghosts().

    :return: A dict built from the (version_id, parents) pairs reported by
        the index.
    """
    return dict(self._index.get_graph())

543

544

def get_sha1(self, version_id):
    """See VersionedFile.get_sha1().

    :return: The digest stored in the knit record for version_id.
    """
    records = self._get_record_map([version_id])
    _method, _content, digest, _build_parent = records[version_id]
    return digest

549

550

@staticmethod
def get_suffixes():
    """See VersionedFile.get_suffixes().

    :return: A list holding the data file suffix followed by the index
        file suffix.
    """
    suffixes = [DATA_SUFFIX]
    suffixes.append(INDEX_SUFFIX)
    return suffixes

554

555

def has_ghost(self, version_id):
    """True if there is a ghost reference in the file to version_id.

    A version that is present in the knit is never a ghost. Otherwise the
    parents lists of all index entries are scanned for version_id.
    """
    # maybe we have it
    if self.has_version(version_id):
        return False
    # optimisable if needed by memoising the _ghosts set.
    known = self._index._cache
    for _node, parents in self._index.get_graph():
        for parent in parents:
            if parent == version_id and parent not in known:
                return True
    return False

568

569

def versions(self):
    """See VersionedFile.versions.

    :return: Whatever the index's get_versions() returns.
    """
    index = self._index
    return index.get_versions()

572

573

def has_version(self, version_id):
    """See VersionedFile.has_version.

    The answer comes straight from the knit index.
    """
    index = self._index
    return index.has_version(version_id)

# 'version_id in knit' is the same question as has_version().
__contains__ = has_version

578

579

def _merge_annotations(self, content, parents, parent_texts=None,
                       delta=None, annotated=None):
    """Merge annotations for content, and optionally build a line delta.

    When annotated is true, content is compared with each parent in turn
    and, for every matching run of lines, the parent's (origin, text)
    entries replace the corresponding entries in content._lines.

    :param content: The content object for the new version; modified in
        place when annotated is true.
    :param parents: The present parents of the new version.
    :param parent_texts: Optional dict of version_id -> content used to
        avoid re-extracting parents. None is treated as an empty dict.
    :param delta: If true, return a line delta against parents[0].
    :param annotated: If true, carry annotations over from the parents.
    :return: The line delta hunks if delta is true, otherwise None.
    """
    if parent_texts is None:
        # a fresh dict per call rather than a shared mutable default
        parent_texts = {}
    if annotated:
        delta_seq = None
        for parent_id in parents:
            merge_content = self._get_content(parent_id, parent_texts)
            seq = patiencediff.PatienceSequenceMatcher(
                               None, merge_content.text(), content.text())
            if delta_seq is None:
                # setup a delta seq to reuse: the comparison against the
                # first parent doubles as the basis for the line delta.
                delta_seq = seq
            for i, j, n in seq.get_matching_blocks():
                if n == 0:
                    continue
                # this appears to copy (origin, text) pairs across to the new
                # content for any line that matches the last-checked parent.
                # FIXME: save the sequence control data for delta compression
                # against the most relevant parent rather than rediffing.
                content._lines[j:j+n] = merge_content._lines[i:i+n]
    if delta:
        if not annotated:
            reference_content = self._get_content(parents[0], parent_texts)
            new_texts = content.text()
            old_texts = reference_content.text()
            delta_seq = patiencediff.PatienceSequenceMatcher(
                                             None, old_texts, new_texts)
        return self._make_line_delta(delta_seq, content)

609

610

def _make_line_delta(self, delta_seq, new_content):
    """Generate a line delta from delta_seq and new_content.

    :param delta_seq: A sequence matcher providing get_opcodes().
    :param new_content: The content whose _lines supply the new lines.
    :return: A list of hunks (old_start, old_end, new_line_count, new_lines),
        one for every opcode that is not 'equal'.
    """
    return [(old_start, old_end, new_end - new_start,
             new_content._lines[new_start:new_end])
            for tag, old_start, old_end, new_start, new_end
            in delta_seq.get_opcodes()
            if tag != 'equal']

618

619

def _get_components_positions(self, version_ids):
    """Produce a map of position data for the components of versions.

    This data is intended to be used for retrieving the knit records.

    A dict of version_id to (method, data_pos, data_size, next) is
    returned, covering each requested version and every version on its
    build chain.
    method is the way referenced data should be applied.
    data_pos is the position of the data in the knit.
    data_size is the size of the data in the knit.
    next is the build-parent of the version, or None for fulltexts.
    """
    positions = {}
    for version_id in version_ids:
        current = version_id
        # follow build-parents until a fulltext or an already-seen entry
        while current is not None and current not in positions:
            method = self._index.get_method(current)
            build_parent = None
            if method != 'fulltext':
                build_parent = self.get_parents(current)[0]
            data_pos, data_size = self._index.get_position(current)
            positions[current] = (method, data_pos, data_size, build_parent)
            current = build_parent
    return positions

645

646

def _get_content(self, version_id, parent_texts=None):
    """Returns a content object that makes up the specified version.

    :param version_id: The version to extract.
    :param parent_texts: Optional dict of version_id -> content object;
        if it holds a non-None entry for version_id that entry is returned
        without reading the knit. None means no cached contents.
    :raises RevisionNotPresent: if version_id is not in this knit.
    """
    if not self.has_version(version_id):
        raise RevisionNotPresent(version_id, self.filename)

    if parent_texts is not None:
        cached_version = parent_texts.get(version_id, None)
        if cached_version is not None:
            return cached_version

    text_map, contents_map = self._get_content_maps([version_id])
    return contents_map[version_id]

658

659

def _check_versions_present(self, version_ids):
    """Check that all specified versions are present.

    The check itself is carried out by the knit index.
    """
    index = self._index
    index.check_versions_present(version_ids)

662

663

def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
    """See VersionedFile.add_lines_with_ghosts().

    Parents are not required to be present. A copy of lines is passed on
    to _add().
    """
    self._check_add(version_id, lines)
    lines_copy = lines[:]
    return self._add(version_id, lines_copy, parents, self.delta,
                     parent_texts)

667

668

def _add_lines(self, version_id, parents, lines, parent_texts):
    """See VersionedFile.add_lines.

    All parents must already be present in the knit. A copy of lines is
    passed on to _add().
    """
    self._check_add(version_id, lines)
    self._check_versions_present(parents)
    lines_copy = lines[:]
    return self._add(version_id, lines_copy, parents, self.delta,
                     parent_texts)

673

674

def _check_add(self, version_id, lines):
    """check that version_id and lines are safe to add.

    :raises InvalidRevisionId: if version_id contains whitespace.
    :raises RevisionAlreadyPresent: if version_id is already in the knit.
    """
    assert self.writable, "knit is not opened for write"
    ### FIXME escape. RBC 20060228
    whitespace_found = contains_whitespace(version_id)
    if whitespace_found:
        raise InvalidRevisionId(version_id, self.filename)
    self.check_not_reserved_id(version_id)
    already_present = self.has_version(version_id)
    if already_present:
        raise RevisionAlreadyPresent(version_id, self.filename)
    # finally validate the lines themselves
    self._check_lines_not_unicode(lines)
    self._check_lines_are_lines(lines)

685

686

def _add(self, version_id, lines, parents, delta, parent_texts):
    """Add a set of lines on top of version specified by parents.

    If delta is true, compress the text as a line-delta against
    the first parent.

    Any versions not present will be converted into ghosts.

    :param version_id: The id of the version being added.
    :param lines: The lines of the new text. This list is modified in
        place when the text lacks a final newline, so callers pass a copy.
    :param parents: The parents of the new version; absent ones are
        recorded in the index but not used for compression.
    :param delta: Whether a line-delta is wanted.
    :param parent_texts: Optional dict of version_id -> content, or None.
    :return: The content object created by the factory for the new text.
    """
    # Profiling notes from 2006: the bulk of the time spent here is in
    # _merge_annotations, add_record and lower_fulltext.

    if parent_texts is None:
        parent_texts = {}
    # Only parents that actually exist in the knit can be diffed against.
    present_parents = [p for p in parents if self.has_version(p)]

    if delta and not present_parents:
        delta = False

    # The digest covers the text exactly as supplied, before any newline
    # is appended below.
    digest = sha_strings(lines)
    options = []
    # endswith() also copes with an empty final line, where indexing the
    # last character would raise IndexError.
    if lines and not lines[-1].endswith('\n'):
        options.append('no-eol')
        lines[-1] = lines[-1] + '\n'

    if present_parents and delta:
        # To speed the extract of texts the delta chain is limited
        # to a fixed number of deltas. This should minimize both
        # I/O and the time spent applying deltas.
        delta = self._check_should_delta(present_parents)

    lines = self.factory.make(lines, version_id)
    if delta or (self.factory.annotated and len(present_parents) > 0):
        # Merge annotations from parent texts if so is needed.
        delta_hunks = self._merge_annotations(lines, present_parents,
                                              parent_texts, delta,
                                              self.factory.annotated)

    if delta:
        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta_hunks)
    else:
        options.append('fulltext')
        store_lines = self.factory.lower_fulltext(lines)

    where, size = self._data.add_record(version_id, digest, store_lines)
    self._index.add_version(version_id, options, where, size, parents)
    return lines

748

749

def check(self, progress_bar=None):
    """See VersionedFile.check().

    This implementation performs no checks; progress_bar is unused.
    """
    pass

751

752

def _clone_text(self, new_version_id, old_version_id, parents):
    """See VersionedFile.clone_text().

    The text of old_version_id is extracted and added again under
    new_version_id with the given parents.
    """
    # FIXME RBC 20060228 make fast by only inserting an index with null
    # delta.
    old_lines = self.get_lines(old_version_id)
    self.add_lines(new_version_id, parents, old_lines)

757

758

def get_lines(self, version_id):
    """See VersionedFile.get_lines().

    Implemented as a single-element get_line_list() request.
    """
    line_lists = self.get_line_list([version_id])
    return line_lists[0]

761

762

def _get_record_map(self, version_ids):
    """Produce a dictionary of knit records.

    The keys are version_ids, the values are tuples of (method, content,
    digest, next).
    method is the way the content should be applied.
    content is whatever the data file's read_records_iter() yields as the
        record content.
    digest is the SHA1 digest of this version id after all steps are done
    next is the build-parent of the version, i.e. the leftmost ancestor.
    If the method is fulltext, next will be None.
    """
    position_map = self._get_components_positions(version_ids)
    # Build the (component, position, size) read requests.
    records = []
    for component_id in position_map:
        _method, position, size, _next = position_map[component_id]
        records.append((component_id, position, size))
    record_map = {}
    reader = self._data.read_records_iter(records)
    for component_id, content, digest in reader:
        method, _position, _size, build_parent = position_map[component_id]
        record_map[component_id] = (method, content, digest, build_parent)
    return record_map

783

784

def get_text(self, version_id):
    """See VersionedFile.get_text

    Implemented as a single-element get_texts() request.
    """
    texts = self.get_texts([version_id])
    return texts[0]

787

788

def get_texts(self, version_ids):
    """Return the texts of the listed versions, each as a single string.

    The result is in the same order as version_ids.
    """
    texts = []
    for lines in self.get_line_list(version_ids):
        texts.append(''.join(lines))
    return texts

790

791

def get_line_list(self, version_ids):
    """Return the texts of listed versions as a list of strings.

    Every version id is checked with check_not_reserved_id() before any
    text is extracted. The result is in the same order as version_ids.
    """
    for version_id in version_ids:
        self.check_not_reserved_id(version_id)
    text_map, _content_map = self._get_content_maps(version_ids)
    result = []
    for version_id in version_ids:
        result.append(text_map[version_id])
    return result

797

798

def _get_content_maps(self, version_ids):
    """Produce maps of text and KnitContents

    :return: (text_map, content_map) where text_map contains the texts for
        the requested versions and content_map contains the KnitContents.
        Both dicts take version_ids as their keys.
    :raises RevisionNotPresent: if a requested version is not in the knit.
    :raises KnitCorrupt: if an extracted text does not match its digest.
    """
    for version_id in version_ids:
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)
    record_map = self._get_record_map(version_ids)

    text_map = {}
    # contents built so far, including intermediate components
    content_map = {}
    final_content = {}
    for version_id in version_ids:
        # Collect the build chain, newest first, stopping at a fulltext or
        # at a component whose content has already been built.
        chain = []
        cursor = version_id
        while cursor is not None:
            method, data, digest, build_parent = record_map[cursor]
            chain.append((cursor, method, data, digest))
            if cursor in content_map:
                break
            cursor = build_parent

        # Replay the chain oldest first.
        content = None
        for component_id, method, data, digest in reversed(chain):
            if component_id in content_map:
                content = content_map[component_id]
                continue
            version_idx = self._index.lookup(component_id)
            if method == 'fulltext':
                assert content is None
                content = self.factory.parse_fulltext(data, version_idx)
            elif method == 'line-delta':
                delta = self.factory.parse_line_delta(data, version_idx)
                content = content.copy()
                content._lines = self._apply_delta(content._lines,
                                                   delta)
            content_map[component_id] = content

        if 'no-eol' in self._index.get_options(version_id):
            # strip the newline that was added when the text was stored
            content = content.copy()
            last_entry = content._lines[-1]
            content._lines[-1] = (last_entry[0], last_entry[1].rstrip('\n'))
        final_content[version_id] = content

        # digest here is the digest from the last applied component.
        text = content.text()
        if sha_strings(text) != digest:
            raise KnitCorrupt(self.filename,
                              'sha-1 does not match %s' % version_id)

        text_map[version_id] = text
    return text_map, final_content

853

854

def iter_lines_added_or_present_in_versions(self, version_ids=None,
                                            pb=None):
    """See VersionedFile.iter_lines_added_or_present_in_versions().

    :param version_ids: The versions to walk; None means all versions.
    :param pb: Optional progress bar; a DummyProgress is used if None.
    :return: An iterator over the lines stored in the records of the
        requested versions, visited in the order given by self.versions().
    :raises RevisionNotPresent: if a requested version is not in the knit.
    """
    if version_ids is None:
        version_ids = self.versions()
    if pb is None:
        pb = progress.DummyProgress()
    # we don't care about inclusions, the caller cares.
    # but we need to setup a list of records to visit.
    # we need version_id, position, length
    version_id_records = []
    requested_versions = set(version_ids)
    # filter for available versions
    for version_id in requested_versions:
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)
    # get a in-component-order queue:
    for version_id in self.versions():
        if version_id in requested_versions:
            data_pos, length = self._index.get_position(version_id)
            version_id_records.append((version_id, data_pos, length))

    total = len(version_id_records)
    records = self._data.read_records_iter(version_id_records)
    for count, (version_id, data, sha_value) in enumerate(records):
        pb.update('Walking content.', count, total)
        method = self._index.get_method(version_id)

        assert method in ('fulltext', 'line-delta')
        if method == 'fulltext':
            line_iterator = self.factory.get_fulltext_content(data)
        else:
            line_iterator = self.factory.get_linedelta_content(data)
        for line in line_iterator:
            yield line

    pb.update('Walking content.', total, total)

892

893

def num_versions(self):
    """See VersionedFile.num_versions().

    The count is provided by the knit index.
    """
    index = self._index
    return index.num_versions()

# len(knit) is the number of versions it holds.
__len__ = num_versions

898

899

def annotate_iter(self, version_id):
    """See VersionedFile.annotate_iter.

    Yields (origin, text) tuples taken from the content object of
    version_id.
    """
    annotated = self._get_content(version_id).annotate_iter()
    for origin, text in annotated:
        yield (origin, text)

904

905

def get_parents(self, version_id):
    """See VersionedFile.get_parents.

    :raises RevisionNotPresent: if the index lookup raises KeyError.
    """
    # perf notes:
    # optimism counts!
    # 52554 calls in 1264 872 internal down from 3674
    # i.e. ask the index directly and translate the failure, rather than
    # checking for presence first.
    index = self._index
    try:
        parents = index.get_parents(version_id)
    except KeyError:
        raise RevisionNotPresent(version_id, self.filename)
    return parents

914

915

def get_parents_with_ghosts(self, version_id):
    """See VersionedFile.get_parents_with_ghosts.

    :raises RevisionNotPresent: if the index lookup raises KeyError.
    """
    index = self._index
    try:
        parents = index.get_parents_with_ghosts(version_id)
    except KeyError:
        raise RevisionNotPresent(version_id, self.filename)
    return parents

921

922

def get_ancestry(self, versions):
    """See VersionedFile.get_ancestry.

    :param versions: A single version id (a string) or a sequence of
        version ids.
    :return: An empty list if no versions were given, otherwise the
        ancestry as computed by the index.
    """
    if isinstance(versions, basestring):
        # a lone id is treated as a one-element list
        versions = [versions]
    if versions:
        return self._index.get_ancestry(versions)
    return []

929

930

def get_ancestry_with_ghosts(self, versions):
    """See VersionedFile.get_ancestry_with_ghosts.

    :param versions: A single version id (a string) or a sequence of
        version ids.
    :return: An empty list if no versions were given, otherwise the
        ancestry including ghosts as computed by the index.
    """
    if isinstance(versions, basestring):
        # a lone id is treated as a one-element list
        versions = [versions]
    if versions:
        return self._index.get_ancestry_with_ghosts(versions)
    return []

937

938

#@deprecated_method(zero_eight)

939

def walk(self, version_ids):
    """See VersionedFile.walk.

    Yields (lineno, insert_id, dset, line) tuples as produced by
    Weave.walk() over a temporary weave built from the ancestry of
    version_ids.
    """
    # We take the short path here, and extract all relevant texts
    # and put them in a weave and let that do all the work.  Far
    # from optimal, but is much simpler.
    # FIXME RB 20060228 this really is inefficient!
    from bzrlib.weave import Weave

    w = Weave(self.filename)
    # A set makes the membership test below O(1) per version instead of a
    # linear scan of the ancestry list for every node in the graph.
    ancestry = set(self.get_ancestry(version_ids))
    sorted_graph = topo_sort(self._index.get_graph())
    version_list = [vid for vid in sorted_graph if vid in ancestry]

    for version_id in version_list:
        lines = self.get_lines(version_id)
        w.add_lines(version_id, self.get_parents(version_id), lines)

    for lineno, insert_id, dset, line in w.walk(version_ids):
        yield lineno, insert_id, dset, line

958

959

def plan_merge(self, ver_a, ver_b):
    """See VersionedFile.plan_merge.

    Yields (state, line) pairs. Lines common to both versions are
    'unchanged'. A line found only in ver_a is 'killed-b' when its
    origin revision is in ver_b's ancestry and 'new-a' otherwise; a line
    found only in ver_b is 'killed-a' or 'new-b' by the same rule.
    """
    ancestors_b = set(self.get_ancestry(ver_b))
    ancestors_a = set(self.get_ancestry(ver_a))

    annotated_a = self.annotate(ver_a)
    annotated_b = self.annotate(ver_b)
    plain_a = [text for (origin, text) in annotated_a]
    plain_b = [text for (origin, text) in annotated_b]
    matcher = KnitSequenceMatcher(None, plain_a, plain_b)
    a_cur = 0
    b_cur = 0
    for a_start, b_start, length in matcher.get_matching_blocks():
        # process all mismatched sections
        # (last mismatched section is handled because blocks always
        # includes a 0-length last block)
        for revision, text in annotated_a[a_cur:a_start]:
            if revision in ancestors_b:
                yield 'killed-b', text
            else:
                yield 'new-a', text
        for revision, text in annotated_b[b_cur:b_start]:
            if revision in ancestors_a:
                yield 'killed-a', text
            else:
                yield 'new-b', text

        # and now the matched section
        a_cur = a_start + length
        b_cur = b_start + length
        matched = zip(plain_a[a_start:a_cur], plain_b[b_start:b_cur])
        for text_a, text_b in matched:
            assert text_a == text_b
            yield "unchanged", text_a

997

998

999

class _KnitComponentFile(object):
    """One of the files used to implement a knit database"""

    def __init__(self, transport, filename, mode, file_mode=None,
                 create_parent_dir=False, dir_mode=None):
        """Record where the file lives and how it should be created.

        :param transport: The transport the file is accessed through.
        :param filename: The name of the file relative to transport.
        :param mode: The access mode requested for the file.
        :param file_mode: Stored as _file_mode for use when writing.
        :param create_parent_dir: Stored as _create_parent_dir.
        :param dir_mode: Stored as _dir_mode.
        """
        # location
        self._transport = transport
        self._filename = filename
        # access and creation settings
        self._mode = mode
        self._file_mode = file_mode
        self._dir_mode = dir_mode
        self._create_parent_dir = create_parent_dir
        # nothing is pending creation until a subclass says so
        self._need_to_create = False

    def _full_path(self):
        """Return the full path to this file."""
        base = self._transport.base
        return base + self._filename

    def check_header(self, fp):
        """Read the first line of fp and check it against self.HEADER.

        :raises NoSuchFile: if the file is empty.
        :raises KnitHeaderError: if the first line is not the header.
        """
        first_line = fp.readline()
        if first_line == '':
            # An empty file can actually be treated as though the file doesn't
            # exist yet.
            raise errors.NoSuchFile(self._full_path())
        if first_line != self.HEADER:
            location = self._transport.abspath(self._filename)
            raise KnitHeaderError(badline=first_line, filename=location)

    def commit(self):
        """Commit is a nop."""

    def __repr__(self):
        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, self._filename)

1031

1032

1033

class _KnitIndex(_KnitComponentFile):

1034

"""Manages knit index file.

1035

1036

The index is already kept in memory and read on startup, to enable

1037

fast lookups of revision information. The cursor of the index

1038

file is always pointing to the end, making it easy to append

1039

entries.

1040

1041

_cache is a cache for fast mapping from version id to a Index

1042

object.

1043

1044

_history is a cache for fast mapping from indexes to version ids.

1045

1046

The index data format is dictionary compressed when it comes to

1047

parent references; an index entry may only have parents with a

1048

lower index number. As a result, the index is topologically sorted.

1049

1050

Duplicate entries may be written to the index for a single version id

1051

if this is done then the latter one completely replaces the former:

1052

this allows updates to correct version and parent information.

1053

Note that the two entries may share the delta, and that successive

1054

annotations and references MUST point to the first entry.

1055

1056

The index file on disc contains a header, followed by one line per knit

1057

record. The same revision can be present in an index file more than once.

1058

The first occurrence gets assigned a sequence number starting from 0.

1059

1060

The format of a single line is

1061

REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n

1062

REVISION_ID is a utf8-encoded revision id

1063

FLAGS is a comma separated list of flags about the record. Values include

1064

no-eol, line-delta, fulltext.

1065

BYTE_OFFSET is the ascii representation of the byte offset in the data file

1066

that the compressed data starts at.

1067

LENGTH is the ascii representation of the length of the data file.

1068

PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of

1069

REVISION_ID.

1070

PARENT_SEQUENCE_ID the ascii representation of the sequence number of a

1071

revision id already in the knit that is a parent of REVISION_ID.

1072

The ' :' marker is the end of record marker.

1073

1074

partial writes:

1075

when a write is interrupted to the index file, it will result in a line

1076

that does not end in ' :'. If the ' :' is not present at the end of a line,

1077

or at the end of the file, then the record that is missing it will be

1078

ignored by the parser.

1079

1080

When writing new records to the index file, the data is preceded by '\n'

1081

to ensure that records always start on new lines even if the last write was

1082

interrupted. As a result it's normal for the last line in the index to be

1083

missing a trailing newline. One can be added with no harmful effects.

1084

"""

1085

1086

HEADER = "# bzr knit index 8\n"

1087

1088

# speed of knit parsing went from 280 ms to 280 ms with slots addition.

1089

# __slots__ = ['_cache', '_history', '_transport', '_filename']

1090

1091

def _cache_version(self, version_id, options, pos, size, parents):
    """Cache a version record in the history array and index cache.

    This is inlined into _load_data for performance. KEEP IN SYNC.
    (It saves 60ms, 25% of the __init__ overhead on local 4000 record
    indexes).
    """
    # only want the _history index to reference the 1st index entry
    # for version_id
    if version_id in self._cache:
        # a repeated entry keeps the sequence number of the first one
        index = self._cache[version_id][5]
    else:
        index = len(self._history)
        self._history.append(version_id)
    record = (version_id, options, pos, size, parents, index)
    self._cache[version_id] = record

1111

1112

def __init__(self, transport, filename, mode, create=False, file_mode=None,
             create_parent_dir=False, delay_create=False, dir_mode=None):
    """Open the knit index, loading any existing records into memory.

    :param transport: The transport the index file is accessed through.
    :param filename: The name of the index file relative to transport.
    :param mode: The access mode; creation is only allowed for 'w'.
    :param create: If true and the index does not exist (or is empty),
        create it instead of raising NoSuchFile.
    :param file_mode: The mode passed on when the index file is written.
    :param create_parent_dir: Passed on to _KnitComponentFile.
    :param delay_create: If true, do not write the header now; only note
        in _need_to_create that the file still has to be created.
    :param dir_mode: Passed on to _KnitComponentFile.
    :raises NoSuchFile: if the index is missing and may not be created.
    """
    _KnitComponentFile.__init__(self, transport, filename, mode,
                                file_mode=file_mode,
                                create_parent_dir=create_parent_dir,
                                dir_mode=dir_mode)
    self._cache = {}
    # position in _history is the 'official' index for a revision
    # but the values may have come from a newer entry.
    # so - wc -l of a knit index is != the number of unique names
    # in the knit.
    self._history = []
    pb = ui.ui_factory.nested_progress_bar()
    try:
        pb.update('read knit index', 0, 1)
        try:
            fp = self._transport.get(self._filename)
            try:
                # _load_data may raise NoSuchFile if the target knit is
                # completely empty.
                self._load_data(fp)
            finally:
                fp.close()
        except NoSuchFile:
            if mode != 'w' or not create:
                raise
            elif delay_create:
                self._need_to_create = True
            else:
                self._transport.put_bytes_non_atomic(
                    self._filename, self.HEADER, mode=self._file_mode)
    finally:
        pb.update('read knit index', 1, 1)
        pb.finished()

1147

1148

def _load_data(self, fp):
    """Parse the index file fp into self._cache and self._history.

    The header is validated with check_header() first. Lines with fewer
    than five fields, or not ending in the ':' marker, are skipped.
    """
    cache = self._cache
    history = self._history

    self.check_header(fp)
    # readlines reads the whole file at once:
    # bad for transports like http, good for local disk
    # we save 60 ms doing this one change (
    # from calling readline each time to calling
    # readlines once.
    # probably what we want for nice behaviour on
    # http is a incremental readlines that yields, or
    # a check for local vs non local indexes,
    for line in fp.readlines():
        fields = line.split()
        if len(fields) < 5 or fields[-1] != ':':
            # corrupt line.
            # FIXME: in the future we should determine if its a
            # short write - and ignore it
            # or a different failure, and raise. RBC 20060407
            continue

        # '.'-prefixed values are literal ids (uncompressed references);
        # anything else is a sequence number into history.
        parents = [value[1:] if value[0] == '.' else history[int(value)]
                   for value in fields[4:-1]]

        version_id, options, pos, size = fields[:4]

        # See self._cache_version
        # only want the _history index to reference the 1st
        # index entry for version_id
        if version_id in cache:
            index = cache[version_id][5]
        else:
            index = len(history)
            history.append(version_id)
        cache[version_id] = (version_id,
                             options.split(','),
                             int(pos),
                             int(size),
                             parents,
                             index)
        # end self._cache_version

1197

1198

def get_graph(self):
    """Return a list of (version_id, parents) pairs for every cached entry."""
    cache = self._cache
    graph = []
    for version_id in cache:
        graph.append((version_id, cache[version_id][4]))
    return graph

1200

1201

def get_ancestry(self, versions):
    """See VersionedFile.get_ancestry.

    Parents that are not present in the index (ghosts) are left out.

    :return: The requested versions and their present ancestors, in
        topologically sorted order.
    :raises RevisionNotPresent: if a requested version is not in the index.
    """
    # get a graph of all the mentioned versions:
    cache = self._cache
    graph = {}
    todo = set(versions)
    while todo:
        version = todo.pop()
        if version not in cache:
            raise RevisionNotPresent(version, self._filename)
        # trim ghosts
        present_parents = [p for p in cache[version][4] if p in cache]
        # queue only the parents that have not been completed yet
        todo.update([p for p in present_parents if p not in graph])
        graph[version] = present_parents
    return topo_sort(graph.items())

1218

1219

def get_ancestry_with_ghosts(self, versions):
    """See VersionedFile.get_ancestry_with_ghosts.

    The requested versions themselves must be present. Ancestors that are
    missing from the index are included as nodes without parents.

    :return: The versions and their ancestors, ghosts included, in
        topologically sorted order.
    """
    # get a graph of all the mentioned versions:
    self.check_versions_present(versions)
    cache = self._cache
    graph = {}
    todo = set(versions)
    while todo:
        version = todo.pop()
        if version not in cache:
            # ghost, fake it
            graph[version] = []
            continue
        parents = cache[version][4]
        # if not completed
        todo.update([p for p in parents if p not in graph])
        graph[version] = parents
    return topo_sort(graph.items())

1238

1239

def num_versions(self):
    """Return the number of entries in the index history."""
    history = self._history
    return len(history)

# len(index) reports the same count as num_versions()
__len__ = num_versions

1243

1244

def get_versions(self):
    """Return the history list of version ids (the list itself, not a copy)."""
    history = self._history
    return history

1246

1247

def idx_to_name(self, idx):
    """Return the version id stored at position idx of the history."""
    history = self._history
    return history[idx]

1249

1250

def lookup(self, version_id):
    """Return the history index recorded for version_id.

    version_id is asserted to be present in the index cache.
    """
    cache = self._cache
    assert version_id in cache
    entry = cache[version_id]
    # slot 5 of a cache entry is the position in the history list
    return entry[5]

1253

1254

def _version_list_to_index(self, versions):
    """Return versions as a space separated string of index references.

    A version found in the cache is written as its history index; any
    other version is written as '.' followed by its utf-8 encoded id.
    """
    encode_utf8 = cache_utf8.encode
    cache = self._cache
    references = []
    for version in versions:
        if version not in cache:
            references.append('.' + encode_utf8(version))
            continue
        # -- inlined lookup() --
        references.append(str(cache[version][5]))
        # -- end lookup () --
    return ' '.join(references)

1266

1267

def add_version(self, version_id, options, pos, size, parents):
    """Add a version record to the index."""
    # a single record is just the one-element case of add_versions()
    record = (version_id, options, pos, size, parents)
    self.add_versions((record,))

1270

1271

def add_versions(self, versions):
    """Add multiple versions to the index.

    :param versions: a list of tuples:
                     (version_id, options, pos, size, parents).
    """
    encode_utf8 = cache_utf8.encode
    # snapshot the in-memory state so it can be put back on failure
    saved_history = self._history[:]
    saved_cache = self._cache.copy()
    index_lines = []

    try:
        for version_id, options, pos, size, parents in versions:
            fields = (encode_utf8(version_id),
                      ','.join(options),
                      pos,
                      size,
                      self._version_list_to_index(parents))
            entry = "\n%s %s %s %s %s :" % fields
            assert isinstance(entry, str), \
                'content must be utf-8 encoded: %r' % (entry,)
            index_lines.append(entry)
            # cache only after the line is built: the parent references
            # above are resolved against the cache as it was before
            # this version was added
            self._cache_version(version_id, options, pos, size, parents)
        if self._need_to_create:
            # first write: emit the header followed by the new entries
            sio = StringIO()
            sio.write(self.HEADER)
            sio.writelines(index_lines)
            sio.seek(0)
            self._transport.put_file_non_atomic(self._filename, sio,
                                create_parent_dir=self._create_parent_dir,
                                mode=self._file_mode,
                                dir_mode=self._dir_mode)
            self._need_to_create = False
        else:
            self._transport.append_bytes(self._filename,
                                         ''.join(index_lines))
    except:
        # If any problems happen, restore the original values and re-raise
        self._history = saved_history
        self._cache = saved_cache
        raise

1310

1311

def has_version(self, version_id):
    """True if the version is in the index."""
    cache = self._cache
    return version_id in cache

1314

1315

def get_position(self, version_id):
    """Return data position and size of specified version."""
    entry = self._cache[version_id]
    pos = entry[2]
    size = entry[3]
    return pos, size

1319

1320

def get_method(self, version_id):
    """Return compression method of specified version.

    The result is 'fulltext' or 'line-delta'; options naming neither
    raise errors.KnitIndexUnknownMethod.
    """
    options = self._cache[version_id][1]
    # 'fulltext' wins when both markers are present
    if 'fulltext' in options:
        return 'fulltext'
    if 'line-delta' in options:
        return 'line-delta'
    raise errors.KnitIndexUnknownMethod(self._full_path(), options)

1329

1330

def get_options(self, version_id):
    """Return the options recorded for the specified version."""
    entry = self._cache[version_id]
    return entry[1]

1332

1333

def get_parents(self, version_id):
    """Return parents of specified version ignoring ghosts."""
    cache = self._cache
    present = []
    for parent in cache[version_id][4]:
        # a parent that is absent from the index is a ghost; leave it out
        if parent in cache:
            present.append(parent)
    return present

1337

1338

def get_parents_with_ghosts(self, version_id):
    """Return parents of specified version with ghosts."""
    entry = self._cache[version_id]
    return entry[4]

1341

1342

def check_versions_present(self, version_ids):
    """Check that all specified versions are present.

    Raises RevisionNotPresent for the first version id that is not in
    the index.
    """
    cache = self._cache
    for version_id in version_ids:
        if version_id in cache:
            continue
        raise RevisionNotPresent(version_id, self._filename)

1348

1349

1350

class _KnitData(_KnitComponentFile):
    """Contents of the knit data file"""

    def __init__(self, transport, filename, mode, create=False, file_mode=None,
                 create_parent_dir=False, delay_create=False,
                 dir_mode=None):
        """Set up the data file wrapper.

        When create is true the file is either written immediately as an
        empty file, or (with delay_create) marked to be created by the
        first record that is added.
        """
        _KnitComponentFile.__init__(self, transport, filename, mode,
                                    file_mode=file_mode,
                                    create_parent_dir=create_parent_dir,
                                    dir_mode=dir_mode)
        self._checked = False
        # TODO: jam 20060713 conceptually, this could spill to disk
        #       if the cached size gets larger than a certain amount
        #       but it complicates the model a bit, so for now just use
        #       a simple dictionary
        self._cache = {}
        self._do_cache = False
        if create:
            if delay_create:
                self._need_to_create = create
            else:
                self._transport.put_bytes_non_atomic(self._filename, '',
                                                     mode=self._file_mode)

    def enable_cache(self):
        """Enable caching of reads."""
        self._do_cache = True

    def clear_cache(self):
        """Clear the record cache."""
        self._do_cache = False
        self._cache = {}

    def _open_file(self):
        """Return the transport's file object, or None if there is no file."""
        try:
            return self._transport.get(self._filename)
        except NoSuchFile:
            pass
        return None

    def _record_to_data(self, version_id, digest, lines):
        """Convert version_id, digest, lines into a raw data block.

        The block is a gzip stream holding a 'version' header line, the
        content lines and an 'end' trailer line.

        :return: (len, a StringIO instance with the raw data ready to read.)
        """
        sio = StringIO()
        data_file = GzipFile(None, mode='wb', fileobj=sio)

        version_id_utf8 = cache_utf8.encode(version_id)
        data_file.writelines(chain(
            ["version %s %d %s\n" % (version_id_utf8,
                                     len(lines),
                                     digest)],
            lines,
            ["end %s\n" % version_id_utf8]))
        data_file.close()
        # the compressed size is wherever the write position ended up
        length = sio.tell()

        sio.seek(0)
        return length, sio

    def add_raw_record(self, raw_data):
        """Append a prepared record to the data file.

        :return: the offset in the data file raw_data was written.
        """
        assert isinstance(raw_data, str), 'data must be plain bytes'
        if not self._need_to_create:
            return self._transport.append_bytes(self._filename, raw_data)
        else:
            # creation was delayed: this record becomes the start of the file
            self._transport.put_bytes_non_atomic(self._filename, raw_data,
                                   create_parent_dir=self._create_parent_dir,
                                   mode=self._file_mode,
                                   dir_mode=self._dir_mode)
            self._need_to_create = False
            return 0

    def add_record(self, version_id, digest, lines):
        """Write new text record to disk.

        :return: (start_pos, size): the position in the file where the
            record was written and the size of the raw record.
        """
        size, sio = self._record_to_data(version_id, digest, lines)
        # write to disk
        if not self._need_to_create:
            start_pos = self._transport.append_file(self._filename, sio)
        else:
            # creation was delayed: this record becomes the start of the file
            self._transport.put_file_non_atomic(self._filename, sio,
                               create_parent_dir=self._create_parent_dir,
                               mode=self._file_mode,
                               dir_mode=self._dir_mode)
            self._need_to_create = False
            start_pos = 0
        if self._do_cache:
            self._cache[version_id] = sio.getvalue()
        return start_pos, size

    def _parse_record_header(self, version_id, raw_data):
        """Parse a record header for consistency.

        :return: the header and the decompressor stream.
                 as (stream, header_record)
        """
        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        rec = self._check_header(version_id, df.readline())
        return df, rec

    def _check_header(self, version_id, line):
        """Split a record header line and check it names version_id.

        :return: the four whitespace separated fields of the header.
        :raises KnitCorrupt: if the header does not have four fields or
            its second field does not decode to version_id.
        """
        rec = line.split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename,
                              'unexpected number of elements in record header')
        if cache_utf8.decode(rec[1]) != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r'
                              % (version_id, rec[1]))
        return rec

    def _parse_record(self, version_id, data):
        """Decompress a raw record and check its header and trailer.

        :return: (record_contents, digest): the content lines between the
            header and the 'end' line, and the fourth header field.
        """
        # profiling notes:
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        df = GzipFile(mode='rb', fileobj=StringIO(data))

        record_contents = df.readlines()
        header = record_contents.pop(0)
        rec = self._check_header(version_id, header)

        last_line = record_contents.pop()
        # the third header field is the number of content lines
        assert len(record_contents) == int(rec[2])
        if last_line != 'end %s\n' % rec[1]:
            raise KnitCorrupt(self._filename,
                              'unexpected version end line %r, wanted %r'
                              % (last_line, version_id))
        df.close()
        return record_contents, rec[3]

    def read_records_iter_raw(self, records):
        """Read text records from data file and yield raw data.

        This unpacks enough of the text record to validate the id is
        as expected but that's all.

        :param records: A sized sequence of (version_id, pos, size)
            entries; it is iterated more than once.
        :return: Yields (version_id, raw_data) in the order requested.
        """
        # Decide once, up front, which records have to come from the
        # transport, so that the readv() request and the loop below stay
        # in lockstep.  Re-checking the cache while iterating is not
        # enough: with caching enabled a version requested twice is
        # cached by its first read, its second readv() result would then
        # never be consumed and would be handed to the next record.
        # The caching mode is sampled here so the plan stays consistent.
        do_cache = self._do_cache
        fetch_flags = []
        needed_offsets = []
        scheduled = set()
        for version_id, pos, size in records:
            fetch = (version_id not in self._cache
                     and version_id not in scheduled)
            fetch_flags.append(fetch)
            if fetch:
                needed_offsets.append((pos, size))
                if do_cache:
                    # a later duplicate will be served from the cache
                    scheduled.add(version_id)

        # setup an iterator of the external records:
        # uses readv so nice and fast we hope.
        if len(records):
            # grab the disk data needed.
            raw_records = self._transport.readv(self._filename, needed_offsets)

        for (version_id, pos, size), fetch in izip(records, fetch_flags):
            if fetch:
                pos, data = raw_records.next()
                if do_cache:
                    self._cache[version_id] = data

                # validate the header
                df, rec = self._parse_record_header(version_id, data)
                df.close()
            else:
                # This data has already been validated
                data = self._cache[version_id]
            yield version_id, data

    def read_records_iter(self, records):
        """Read text records from data file and yield result.

        The result will be returned in whatever is the fastest to read.
        Not by the order requested. Also, multiple requests for the same
        record will only yield 1 response.
        :param records: A list of (version_id, pos, len) entries
        :return: Yields (version_id, contents, digest) in the order
                 read, not the order requested
        """
        if not records:
            return

        if self._cache:
            # Skip records we have already seen
            yielded_records = set()
            needed_records = set()
            for record in records:
                if record[0] in self._cache:
                    if record[0] in yielded_records:
                        continue
                    yielded_records.add(record[0])
                    data = self._cache[record[0]]
                    content, digest = self._parse_record(record[0], data)
                    yield (record[0], content, digest)
                else:
                    needed_records.add(record)
            # read the remaining records in file-offset order
            needed_records = sorted(needed_records, key=operator.itemgetter(1))
        else:
            needed_records = sorted(set(records), key=operator.itemgetter(1))

        if not needed_records:
            return

        # The transport optimizes the fetching as well
        # (ie, reads continuous ranges.)
        readv_response = self._transport.readv(self._filename,
            [(pos, size) for version_id, pos, size in needed_records])

        for (version_id, pos, size), (pos, data) in \
                izip(iter(needed_records), readv_response):
            content, digest = self._parse_record(version_id, data)
            if self._do_cache:
                self._cache[version_id] = data
            yield version_id, content, digest

    def read_records(self, records):
        """Read records into a dictionary."""
        components = {}
        for record_id, content, digest in \
                self.read_records_iter(records):
            components[record_id] = (content, digest)
        return components

1574

1575

1576

class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_from_factory = KnitVersionedFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Return True when both source and target are KnitVersionedFiles."""
        try:
            return (isinstance(source, KnitVersionedFile) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join.

        Copies the raw records of the needed source versions into the
        target, then merges the parent lists of versions whose parents
        differ between source and target.

        :return: the number of records copied into the target.
        """
        # NOTE(review): the pb argument is replaced by a nested progress
        # bar below and msg is not referenced in this method.
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            # versions the target lacks entirely
            needed_versions = self.source_ancestry - this_versions
            # versions both sides have: their parent lists must be compared
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                if n1 != n2:
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            # safe
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            # needed versions in topological order, skipping any the
            # target already has
            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            # plan the join:
            copy_queue = []
            copy_queue_records = []
            copy_set = set()
            for version_id in version_list:
                options = self.source._index.get_options(version_id)
                parents = self.source._index.get_parents_with_ghosts(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must :
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (self.target.has_version(parent) or
                            parent in copy_set or
                            not self.source.has_version(parent))
                data_pos, data_size = self.source._index.get_position(version_id)
                copy_queue_records.append((version_id, data_pos, data_size))
                copy_queue.append((version_id, options, parents))
                copy_set.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            raw_datum = []
            raw_records = []
            # the raw reader is walked in step with copy_queue, so each
            # raw record is paired with the options and parents planned
            # for the same version
            for (version_id, raw_data), \
                (version_id2, options, parents) in \
                izip(self.source._data.read_records_iter_raw(copy_queue_records),
                     copy_queue):
                assert version_id == version_id2, 'logic error, inconsistent results'
                count = count + 1
                pb.update("Joining knit", count, total)
                raw_records.append((version_id, options, parents, len(raw_data)))
                raw_datum.append(raw_data)
            self.target._add_raw_records(raw_records, ''.join(raw_datum))

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()

1688

1689

1690

# Register InterKnit with InterVersionedFile as an optimiser.
InterVersionedFile.register_optimiser(InterKnit)

1691

1692

1693

class WeaveToKnit(InterVersionedFile):
    """Optimised code paths for weave to knit operations."""

    _matching_file_from_factory = bzrlib.weave.WeaveFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with weaves to knits."""
        try:
            return (isinstance(source, bzrlib.weave.Weave) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join.

        Adds the lines of each needed source version to the target, then
        merges the parent lists of versions whose source parents are not
        all recorded in the target.

        :return: the number of versions added to the target.
        """
        # NOTE(review): the pb argument is replaced by a nested progress
        # bar below and msg is not referenced in this method.
        assert isinstance(self.source, bzrlib.weave.Weave)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            # versions the target lacks entirely
            needed_versions = self.source_ancestry - this_versions
            # versions both sides have: their parent lists must be compared
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # if all of n2's parents are in n1, then its fine.
                if n2.difference(n1):
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            # safe
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            # needed versions in topological order, skipping any the
            # target already has
            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            # do the join:
            count = 0
            total = len(version_list)
            for version_id in version_list:
                pb.update("Converting to knit", count, total)
                parents = self.source.get_parents(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must already have it
                    assert (self.target.has_version(parent))
                self.target.add_lines(
                    version_id, parents, self.source.get_lines(version_id))
                count = count + 1

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()

1781

1782

1783

# Register WeaveToKnit with InterVersionedFile as an optimiser.
InterVersionedFile.register_optimiser(WeaveToKnit)

1784

1785

1786

class KnitSequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
            # following improvement
            #     704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to
            #     704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            #  +211400 0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>

            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize

Older »