~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

Committer: Jelmer Vernooij
Date: 2006-06-21 13:54:14 UTC
mto: (1558.14.8 Aaron's integration)
mto: This revision was merged to the branch mainline in revision 1803.
Revision ID: jelmer@samba.org-20060621135414-11a3a70e53adbb99

Install benchmarks.

files added:
BRANCH.TODO

HACKING

INSTALL

Makefile

NEWS.developers

bzrlib/annotate.py

bzrlib/benchmarks

bzrlib/benchmarks/__init__.py

bzrlib/benchmarks/bench_add.py

bzrlib/benchmarks/bench_bench.py

bzrlib/benchmarks/bench_checkout.py

bzrlib/benchmarks/bench_commit.py

bzrlib/benchmarks/bench_inventory.py

bzrlib/benchmarks/bench_log.py

bzrlib/benchmarks/bench_osutils.py

bzrlib/benchmarks/bench_rocks.py

bzrlib/benchmarks/bench_status.py

bzrlib/benchmarks/bench_transform.py

bzrlib/benchmarks/bench_workingtree.py

bzrlib/builtins.py

bzrlib/bundle

bzrlib/bundle/apply_bundle.py

bzrlib/bundle/commands.py

bzrlib/bundle/common.py

bzrlib/bundle/old

bzrlib/bundle/old/send_changeset.py

bzrlib/bundle/read_bundle.py

bzrlib/bundle/serializer

bzrlib/bundle/serializer/__init__.py

bzrlib/bundle/serializer/v07.py

bzrlib/bzrdir.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/decorators.py

bzrlib/delta.py

bzrlib/doc

bzrlib/doc/__init__.py

bzrlib/doc/api

bzrlib/doc/api/__init__.py

bzrlib/doc/api/branch.txt

bzrlib/doc/api/transport.txt

bzrlib/export

bzrlib/export/__init__.py

bzrlib/export/dir_exporter.py

bzrlib/export/tar_exporter.py

bzrlib/export/zip_exporter.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/identitymap.py

bzrlib/inter.py

bzrlib/iterablefile.py

bzrlib/knit.py

bzrlib/lockable_files.py

bzrlib/lockdir.py

bzrlib/lsprof.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/option.py

bzrlib/patches.py

bzrlib/patiencediff.py

bzrlib/plugins/__init__.py

bzrlib/plugins/launchpad

bzrlib/plugins/launchpad/__init__.py

bzrlib/plugins/launchpad/lp_registration.py

bzrlib/plugins/launchpad/test_register.py

bzrlib/reconcile.py

bzrlib/repository.py

bzrlib/revisionspec.py

bzrlib/rio.py

bzrlib/shellcomplete.py

bzrlib/sign_my_commits.py

bzrlib/store

bzrlib/store/revision

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/store/text.py

bzrlib/store/versioned

bzrlib/store/versioned/__init__.py

bzrlib/symbol_versioning.py

bzrlib/testament.py

bzrlib/tests/EncodingAdapter.py

bzrlib/tests/HTTPTestUtil.py

bzrlib/tests/TestUtil.py

bzrlib/tests/blackbox

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_add.py

bzrlib/tests/blackbox/test_added.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_ancestry.py

bzrlib/tests/blackbox/test_annotate.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_branch.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_bundle.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_command_encoding.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_exceptions.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_find_merge_base.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_ignored.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_non_ascii.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_reconcile.py

bzrlib/tests/blackbox/test_remove.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_revision_history.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/branch_implementations

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_bound_sftp.py

bzrlib/tests/branch_implementations/test_break_lock.py

bzrlib/tests/branch_implementations/test_parent.py

bzrlib/tests/branch_implementations/test_permissions.py

bzrlib/tests/branch_implementations/test_pull.py

bzrlib/tests/branch_implementations/test_update.py

bzrlib/tests/bzrdir_implementations

bzrlib/tests/bzrdir_implementations/__init__.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/interrepository_implementations

bzrlib/tests/interrepository_implementations/__init__.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/interversionedfile_implementations

bzrlib/tests/interversionedfile_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/test_join.py

bzrlib/tests/repository_implementations

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_break_lock.py

bzrlib/tests/repository_implementations/test_commit_builder.py

bzrlib/tests/repository_implementations/test_fileid_involved.py

bzrlib/tests/repository_implementations/test_reconcile.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/revisionstore_implementations

bzrlib/tests/revisionstore_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_api.py

bzrlib/tests/test_bad_files.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_command.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_commit_merge.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_decorators.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_doc_generate.py

bzrlib/tests/test_emptytree.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_escaped_store.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_gpg.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_http.py

bzrlib/tests/test_identitymap.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_lockdir.py

bzrlib/tests/test_log.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_missing.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_nonascii.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_patch.py

bzrlib/tests/test_patches.py

bzrlib/tests/test_patches_data

bzrlib/tests/test_patches_data/diff

bzrlib/tests/test_patches_data/diff-2

bzrlib/tests/test_patches_data/diff-3

bzrlib/tests/test_patches_data/diff-4

bzrlib/tests/test_patches_data/diff-5

bzrlib/tests/test_patches_data/diff-6

bzrlib/tests/test_patches_data/insert_top.patch

bzrlib/tests/test_patches_data/mod

bzrlib/tests/test_patches_data/mod-2

bzrlib/tests/test_patches_data/mod-3

bzrlib/tests/test_patches_data/mod-4

bzrlib/tests/test_patches_data/mod-5

bzrlib/tests/test_patches_data/mod-6

bzrlib/tests/test_patches_data/orig

bzrlib/tests/test_patches_data/orig-2

bzrlib/tests/test_patches_data/orig-3

bzrlib/tests/test_patches_data/orig-4

bzrlib/tests/test_patches_data/orig-5

bzrlib/tests/test_patches_data/orig-6

bzrlib/tests/test_patches_data/patchtext.patch

bzrlib/tests/test_permissions.py

bzrlib/tests/test_progress.py

bzrlib/tests/test_read_bundle.py

bzrlib/tests/test_reconcile.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_revisiontree.py

bzrlib/tests/test_revprops.py

bzrlib/tests/test_rio.py

bzrlib/tests/test_sampler.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_source.py

bzrlib/tests/test_status.py

bzrlib/tests/test_store.py

bzrlib/tests/test_symbol_versioning.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_textfile.py

bzrlib/tests/test_textmerge.py

bzrlib/tests/test_trace.py

bzrlib/tests/test_transactions.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_tsort.py

bzrlib/tests/test_tuned_gzip.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_urlutils.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_xml.py

bzrlib/tests/treeshape.py

bzrlib/tests/workingtree_implementations

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_basis_inventory.py

bzrlib/tests/workingtree_implementations/test_break_lock.py

bzrlib/tests/workingtree_implementations/test_commit.py

bzrlib/tests/workingtree_implementations/test_get_parent_ids.py

bzrlib/tests/workingtree_implementations/test_is_control_filename.py

bzrlib/tests/workingtree_implementations/test_is_ignored.py

bzrlib/tests/workingtree_implementations/test_pull.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/textfile.py

bzrlib/textmerge.py

bzrlib/transactions.py

bzrlib/transform.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/decorator.py

bzrlib/transport/fakenfs.py

bzrlib/transport/fakevfat.py

bzrlib/transport/ftp.py

bzrlib/transport/http

bzrlib/transport/http/__init__.py

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/readonly.py

bzrlib/transport/sftp.py

bzrlib/tsort.py

bzrlib/tuned_gzip.py

bzrlib/ui

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/uncommit.py

bzrlib/upgrade.py

bzrlib/urlutils.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/versionedfile.py

bzrlib/weave_commands.py

bzrlib/win32console.py

bzrlib/xml4.py

bzrlib/xml5.py

contrib/emacs

contrib/emacs/bzr-mode.el

doc/README.1st

doc/configuration.txt

doc/plugins.txt

doc/setting_up_email.txt

doc/specifying_revisions.txt

doc/tutorial.txt

doc/using_aliases.txt

generate_docs.py

profile_imports.py

tools/__init__.py

tools/biobench.py

tools/capture_tree.py

tools/doc_generate

tools/doc_generate/__init__.py

tools/doc_generate/autodoc_bash_completion.py

tools/doc_generate/autodoc_rstx.py

tools/history2revfiles.py

tools/http_client.py

tools/riodemo.py

tools/trace-revisions

files removed:
bzrlib/mdiff.py

bzrlib/merge_core.py

bzrlib/meta_store.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/upgrade.py

doc/Makefile

doc/adoption.txt

doc/bitkeeper.txt

doc/changelogs.txt

doc/cherry-picking.txt

doc/cmdref.txt

doc/common-format.txt

doc/compared-aegis.txt

doc/compared-codeville.txt

doc/compared-cvsnt.txt

doc/compared-opencm.txt

doc/compared-prcs.txt

doc/compared-teamware.txt

doc/compression.txt

doc/config-specs.txt

doc/conflicts.txt

doc/costs.txt

doc/darcs.txt

doc/deadly-sins.txt

doc/default.css

doc/design.txt

doc/extra-commands.txt

doc/formats.txt

doc/hashes.txt

doc/ignore.txt

doc/index.txt

doc/interrupted.txt

doc/intro.txt

doc/inventory.txt

doc/join-branches.txt

doc/kill-version.txt

doc/layers.txt

doc/library-interface.txt

doc/merge.txt

doc/mirroring.txt

doc/monotone.txt

doc/news.txt

doc/optional-edit.txt

doc/partial-commit.txt

doc/pool.txt

doc/purpose.txt

doc/python.txt

doc/quilt.txt

doc/quotes.txt

doc/random.txt

doc/requirements.txt

doc/revfile-annotation.txt

doc/revfile.txt

doc/revision-syntax.txt

doc/rollup.txt

doc/scalability.txt

doc/security.txt

doc/shared-branches.txt

doc/short-demo.txt

doc/supportability.txt

doc/svk.txt

doc/switch-in-branch.txt

doc/tagging.txt

doc/taxonomy.txt

doc/thanks.txt

doc/todo-from-arch.txt

doc/unchanged.txt

doc/unrelated-merge.txt

doc/usability.txt

doc/use-cases.txt

doc/web-interface.txt

doc/workflow.txt

doc/yaml.txt

notes

notes/new-inventory-sample.xml

notes/performance.txt

patches

patches/annotate3.patch

patches/annotate4.patch

patches/cache-remote-revisions.diff

patches/find-touching-from-seq.diff

patches/meta-data-in-inventory.patch

patches/ndiff.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

plugins/changeset

plugins/changeset/__init__.py

plugins/changeset/apply_changeset.py

plugins/changeset/common.py

plugins/changeset/gen_changeset.py

plugins/changeset/read_changeset.py

plugins/checkperms

testbzr

testsweet.py

files renamed:
bzrlib/changeset.py => bzrlib/bundle/__init__.py

plugins/ => bzrlib/plugins/

bzrlib/store.py => bzrlib/store/__init__.py

bzrlib/selftest/ => bzrlib/tests/

bzrlib/selftest/teststatus.py => bzrlib/tests/blackbox/test_status.py

bzrlib/selftest/blackbox.py => bzrlib/tests/blackbox/test_too_much.py

bzrlib/selftest/versioning.py => bzrlib/tests/blackbox/test_versioning.py

bzrlib/selftest/testbranch.py => bzrlib/tests/branch_implementations/test_branch.py

bzrlib/selftest/testhashcache.py => bzrlib/tests/test_hashcache.py

bzrlib/selftest/testinv.py => bzrlib/tests/test_inv.py

bzrlib/selftest/testmerge3.py => bzrlib/tests/test_merge3.py

bzrlib/selftest/plugins.py => bzrlib/tests/test_plugins.py

bzrlib/selftest/testrevisionnamespaces.py => bzrlib/tests/test_revisionnamespaces.py

tools/testweave.py => bzrlib/tests/test_weave.py

bzrlib/selftest/whitebox.py => bzrlib/tests/test_whitebox.py

effbot/ => bzrlib/util/effbot/

elementtree/ => bzrlib/util/elementtree/

urlgrabber/ => bzrlib/util/urlgrabber/

bzrlib/xml.py => bzrlib/xml_serializer.py

bzrlib/newinventory.py => contrib/newinventory.py

bzr-man.py => tools/doc_generate/autodoc_man.py

files modified:
.bzrignore

.rsyncexclude

NEWS

README

TODO

build-api

bzr *

bzrlib/__init__.py

bzrlib/add.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/info.py

bzrlib/intset.py

bzrlib/inventory.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/revision.py

bzrlib/status.py

bzrlib/tests/__init__.py

bzrlib/textinv.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/weave.py *

bzrlib/weavefile.py

bzrlib/workingtree.py

contrib/pwk

contrib/zsh/_bzr

setup.py *

tools/weavebench.py

Show diffs side-by-side

added added

removed removed

bzrlib/knit.py

# Written by Martin Pool.

# Modified by Johan Rydberg <jrydberg@gnu.org>

# Modified by Robert Collins <robert.collins@canonical.com>

# Modified by Aaron Bentley <aaron.bentley@utoronto.ca>

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only

updates.

Knit file layout:

lifeless: the data file is made up of "delta records". each delta record has a delta header

that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of

the -expanded data- (ie, the delta applied to the parent). the delta also ends with an

end-marker; simply "end VERSION"

delta can be line or full contents.

... the 8's there are the index number of the annotation.

version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e

59,59,3

8 if ie.executable:

8 e.set('executable', 'yes')

130,130,2

8 if elt.get('executable') == 'yes':

8 ie.executable = True

end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

whats in an index:

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents

09:33 < jrydberg> lifeless: the parents are currently dictionary compressed

09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)

09:33 < lifeless> right

09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used

in the deltas to provide line annotation

"""

# TODOS:

# 10:16 < lifeless> make partial index writes safe

# 10:16 < lifeless> implement 'knit.check()' like weave.check()

# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave

# always' approach.

# move sha1 out of the content so that join is faster at verifying parents

# record content length ?

from copy import copy

from cStringIO import StringIO

import difflib

from itertools import izip, chain

import operator

import os

import sys

import bzrlib

import bzrlib.errors as errors

from bzrlib.errors import FileExists, NoSuchFile, KnitError, \

InvalidRevisionId, KnitCorrupt, KnitHeaderError, \

RevisionNotPresent, RevisionAlreadyPresent

from bzrlib.tuned_gzip import GzipFile

from bzrlib.trace import mutter

from bzrlib.osutils import contains_whitespace, contains_linebreaks, \

sha_strings

from bzrlib.versionedfile import VersionedFile, InterVersionedFile

from bzrlib.tsort import topo_sort

import bzrlib.weave

# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data

# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor

# position after writing to work out where it was located. we may need to

100

# bypass python file buffering.

101

102

DATA_SUFFIX = '.knit'

103

INDEX_SUFFIX = '.kndx'

104

105

106

class KnitContent(object):
    """Annotated text of a single knit version.

    The text is held as a sequence of (origin, text) pairs in
    ``self._lines``; deltas are computed against, and applied to, that
    sequence.
    """

    def __init__(self, lines):
        # The sequence is stored as given (not copied).
        self._lines = lines

    def annotate_iter(self):
        """Iterate over the content, yielding an (origin, text) tuple per line."""
        for pair in self._lines:
            origin, text = pair
            yield origin, text

    def annotate(self):
        """Return the (origin, text) tuples of annotate_iter() as a list."""
        return [pair for pair in self.annotate_iter()]

    def line_delta_iter(self, new_lines):
        """Yield the hunks that turn this content into new_lines.

        :param new_lines: the target content; only its ``_lines`` attribute
            is read.
        :return: an iterator of (old_start, old_end, new_length, new_pairs)
            tuples, one per non-'equal' opcode reported by
            KnitSequenceMatcher.
        """
        target_texts = [text for origin, text in new_lines._lines]
        source_texts = [text for origin, text in self._lines]
        matcher = KnitSequenceMatcher(None, source_texts, target_texts)
        for tag, old_start, old_end, new_start, new_end in matcher.get_opcodes():
            if tag != 'equal':
                replacement = new_lines._lines[new_start:new_end]
                yield (old_start, old_end, new_end - new_start, replacement)

    def line_delta(self, new_lines):
        """Return the hunks of line_delta_iter(new_lines) as a list."""
        return [hunk for hunk in self.line_delta_iter(new_lines)]

    def text(self):
        """Return the text of every line, with the origins dropped."""
        plain = []
        for origin, text in self._lines:
            plain.append(text)
        return plain

137

138

139

class _KnitFactory(object):
    """Common base of the knit content factories."""

    def make(self, lines, version):
        """Build a KnitContent in which every line is attributed to version.

        :param lines: a sized sequence of text lines.
        :param version: the origin paired with each of those lines.
        """
        origins = [version] * len(lines)
        return KnitContent(zip(origins, lines))

145

146

147

class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects.

    In the serialized form every text line is prefixed by its utf-8 encoded
    origin and a single space.
    """

    annotated = True

    def parse_fulltext(self, content, version):
        """Convert fulltext to internal representation.

        fulltext content is of the format
        revid(utf8) plaintext\\n
        internal representation is of the format:
        (revid, plaintext)

        :param content: iterable of serialized lines.
        :param version: not read by this factory; each line names its own
            origin.
        :return: a KnitContent built from the parsed (revid, plaintext) pairs.
        """
        lines = []
        for line in content:
            origin, text = line.split(' ', 1)
            lines.append((origin.decode('utf-8'), text))
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines, version=None):
        """Yield the hunks produced by parse_line_delta(lines, version).

        The previous implementation subscripted the bound method
        (``self.parse_line_delta[lines]``) and therefore raised TypeError as
        soon as it was iterated. ``version`` is optional so existing
        one-argument callers keep working; parse_line_delta does not read it.
        """
        for result_item in self.parse_line_delta(lines, version):
            yield result_item

    def parse_line_delta(self, lines, version):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        1..count lines:
        revid(utf8) newline\\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])

        :param lines: iterable of serialized delta lines.
        :param version: not read by this factory.
        :return: a list of hunks in the internal representation.
        """
        result = []
        lines = iter(lines)
        # Bound once: the content lines of a hunk are pulled from the same
        # iterator that the outer loop reads the hunk headers from.
        read_line = lines.next
        # walk through the lines parsing.
        for header in lines:
            start, end, count = [int(n) for n in header.split(',')]
            contents = []
            remaining = count
            while remaining:
                origin, text = read_line().split(' ', 1)
                remaining -= 1
                contents.append((origin.decode('utf-8'), text))
            result.append((start, end, count, contents))
        return result

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        return ['%s %s' % (o.encode('utf-8'), t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            for origin, text in lines:
                out.append('%s %s' % (origin.encode('utf-8'), text))
        return out

213

214

215

class KnitPlainFactory(_KnitFactory):
    """Factory for content whose serialized form carries no annotations."""

    annotated = False

    def parse_fulltext(self, content, version):
        """Parse an unannotated fulltext.

        This is not a no-op: the internal representation still pairs every
        line with an origin, which here is the constant ``version``.
        """
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        """Yield (start, end, count, pairs) hunks parsed from lines.

        ``lines`` must be a list and is consumed destructively: each header
        is popped from the front and the hunk's content lines are deleted
        after the hunk has been yielded.
        """
        while lines:
            fields = lines.pop(0).split(',')
            start, end, count = [int(field) for field in fields]
            payload = lines[:count]
            yield start, end, count, zip([version] * count, payload)
            del lines[:count]

    def parse_line_delta(self, lines, version):
        """Return the hunks of parse_line_delta_iter() as a list."""
        return [hunk for hunk in self.parse_line_delta_iter(lines, version)]

    def lower_fulltext(self, content):
        """Serialize a fulltext: just the text lines, origins dropped."""
        return content.text()

    def lower_line_delta(self, delta):
        """Serialize a delta: a 'start,end,count' header then the hunk's text."""
        serialized = []
        for start, end, count, pairs in delta:
            serialized.append('%d,%d,%d\n' % (start, end, count))
            for origin, text in pairs:
                serialized.append(text)
        return serialized

247

248

249

def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location.

    :param transport: transport the knit files are created on.
    :param relpath: path of the knit, without suffix.
    """
    # KnitVersionedFile takes (relpath, transport, file_mode, access_mode,
    # factory, ...). The previous positional call passed the transport as
    # relpath, 'w' as file_mode and the factory *class* as access_mode.
    # create=True is required because the constructor documents that it
    # otherwise only opens an existing knit.
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory(), create=True)
    # NOTE(review): presumably this forces the data file into existence;
    # _KnitData is not visible here - confirm.
    k._data._open_file()

253

254

255

class KnitVersionedFile(VersionedFile):

256

"""Weave-like structure with faster random access.

257

258

A knit stores a number of texts and a summary of the relationships

259

between them. Texts are identified by a string version-id. Texts

260

are normally stored and retrieved as a series of lines, but can

261

also be passed as single strings.

262

263

Lines are stored with the trailing newline (if any) included, to

264

avoid special cases for files with no final newline. Lines are

265

composed of 8-bit characters, not unicode. The combination of

266

these approaches should mean any 'binary' file can be safely

267

stored and retrieved.

268

"""

269

270

def __init__(self, relpath, transport, file_mode=None, access_mode=None,
             factory=None, basis_knit=None, delta=True, create=False):
    """Construct a knit at location specified by relpath.

    :param relpath: path of the knit without suffix; INDEX_SUFFIX and
        DATA_SUFFIX are appended to name the two files.
    :param transport: transport handed to the index and data objects.
    :param file_mode: forwarded unchanged to the index and data objects.
    :param access_mode: 'r' or 'w'; None means 'w'.
    :param factory: content factory; a false value selects a new
        KnitAnnotateFactory.
    :param basis_knit: optional KnitVersionedFile, stored as
        ``self.basis_knit``.
    :param delta: stored as ``self.delta``.
    :param create: If not True, only open an existing knit.
    """
    if access_mode is None:
        access_mode = 'w'
    super(KnitVersionedFile, self).__init__(access_mode)
    assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
    assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
        type(basis_knit)

    if not factory:
        factory = KnitAnnotateFactory()

    self.transport = transport
    self.filename = relpath
    self.basis_knit = basis_knit
    self.factory = factory
    self.writable = (access_mode == 'w')
    self.delta = delta

    index_name = relpath + INDEX_SUFFIX
    self._index = _KnitIndex(transport, index_name, access_mode,
                             create=create, file_mode=file_mode)
    # The data file is only created when the knit has no versions yet;
    # len(self) is evaluated after the index has been attached.
    create_data = create and not len(self)
    data_name = relpath + DATA_SUFFIX
    self._data = _KnitData(transport, data_name, access_mode,
                           create=create_data, file_mode=file_mode)

294

295

def __repr__(self):
    """Return 'ClassName(absolute-path)' using the transport's abspath()."""
    class_name = self.__class__.__name__
    location = self.transport.abspath(self.filename)
    return '%s(%s)' % (class_name, location)

298

299

def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
    """See VersionedFile._add_delta().

    The delta is stored directly as a 'line-delta' record when a fulltext
    is found within 25 steps back from delta_parent. In the two other
    cases (no delta_parent, or no fulltext found within that limit) the
    work is handed to the base class implementation.
    """
    self._check_add(version_id, [])  # should we check the lines ?
    self._check_versions_present(parents)
    # NOTE(review): this present/ghost partition is never read again in
    # this method; it is kept so has_version() is still consulted for each
    # parent exactly as before.
    present_parents = []
    ghosts = []
    parent_texts = {}
    for candidate in parents:
        if self.has_version(candidate):
            present_parents.append(candidate)
        else:
            ghosts.append(candidate)

    if delta_parent is None:
        # reconstitute as full text.
        assert len(delta) == 1 or len(delta) == 0
        if len(delta):
            first_hunk = delta[0]
            assert first_hunk[0] == 0
            assert first_hunk[1] == 0, first_hunk[1]
        return super(KnitVersionedFile, self)._add_delta(
            version_id, parents, delta_parent, sha1, noeol, delta)

    options = []
    if noeol:
        options.append('no-eol')

    # delta_parent is known to be set here: the None case returned above.
    # To speed the extract of texts the delta chain is limited to a fixed
    # number of deltas. This should minimize both I/O and the time spent
    # applying deltas.
    chain_length = 0
    ancestors = [delta_parent]
    while chain_length < 25:
        basis = ancestors[0]
        method = self._index.get_method(basis)
        if method == 'fulltext':
            break
        ancestors = self._index.get_parents(basis)
        chain_length += 1
    if method == 'line-delta':
        # did not find a fulltext in the delta limit.
        # just do a normal insertion.
        return super(KnitVersionedFile, self)._add_delta(
            version_id, parents, delta_parent, sha1, noeol, delta)

    options.append('line-delta')
    store_lines = self.factory.lower_line_delta(delta)

    where, size = self._data.add_record(version_id, sha1, store_lines)
    self._index.add_version(version_id, options, where, size, parents)

360

361

def _add_raw_records(self, records, data):
    """Add all the records 'records' with data pre-joined in 'data'.

    :param records: A list of tuples(version_id, options, parents, size).
    :param data: The data for the records. When it is written, the records
        are adjusted to have pos pointing into data by the sum of
        the preceding records sizes.
    """
    # Write everything in one go; the returned value is used as the
    # position of the first record.
    offset = self._data.add_raw_record(data)
    entries = []
    for version_id, options, parents, size in records:
        entry = (version_id, options, offset, size, parents)
        entries.append(entry)
        offset = offset + size
    self._index.add_versions(entries)

376

377

def clear_cache(self):
    """Drop the cache held by the data object; the index is left alone."""
    data = self._data
    data.clear_cache()

380

381

def copy_to(self, name, transport):
    """See VersionedFile.copy_to().

    The index is first copied under a temporary '.tmp' name, then the data
    file is copied, and only then is the index renamed into place.
    """
    index_target = name + INDEX_SUFFIX
    index_temp = index_target + '.tmp'
    # copy the current index to a temp index to avoid racing with local
    # writes
    index_source = self.transport.get(self._index._filename)
    transport.put(index_temp, index_source)
    # copy the data file
    transport.put(name + DATA_SUFFIX, self._data._open_file())
    # rename the copied index into place
    transport.rename(index_temp, index_target)

390

391

def create_empty(self, name, transport, mode=None):
    """Create a new knit at name on transport, reusing this knit's settings.

    The new knit shares this knit's factory and delta setting and is
    opened with create=True.
    """
    # NOTE(review): ``mode`` is accepted but never used - confirm whether
    # it was meant to be forwarded to the constructor.
    settings = {
        'factory': self.factory,
        'delta': self.delta,
        'create': True,
    }
    return KnitVersionedFile(name, transport, **settings)

393

394

def _fix_parents(self, version, new_parents):
    """Fix the parents list for version.

    This is done by appending a new version to the index
    with identical data except for the parents list.
    the parents list must be a superset of the current
    list.
    """
    cached = self._index._cache[version]
    options, position, size, old_parents = (
        cached[1], cached[2], cached[3], cached[4])
    # No currently recorded parent may be missing from the new list.
    dropped = set(old_parents).difference(set(new_parents))
    assert dropped == set()
    self._index.add_version(version, options, position, size, new_parents)

409

410

def get_delta(self, version_id):
    """Get a delta for constructing version from some other version.

    :return: a tuple (parent, sha1, noeol, delta) where parent is the first
        parent of version_id (None when it has no parents) and delta is
        relative to that parent.
    :raises RevisionNotPresent: if version_id is not in this knit.
    """
    if not self.has_version(version_id):
        raise RevisionNotPresent(version_id, self.filename)

    parent = None
    parents = self.get_parents(version_id)
    if len(parents):
        parent = parents[0]

    data_pos, data_size = self._index.get_position(version_id)
    wanted = ((version_id, data_pos, data_size),)
    data, sha1 = self._data.read_records(wanted)[version_id]
    version_idx = self._index.lookup(version_id)
    noeol = 'no-eol' in self._index.get_options(version_id)

    if 'fulltext' != self._index.get_method(version_id):
        # Stored as a delta already: parse and hand it back.
        delta = self.factory.parse_line_delta(data, version_idx)
        return parent, sha1, noeol, delta

    # Stored as a fulltext: diff it against the first parent's text, or
    # against nothing when there is no parent.
    new_content = self.factory.parse_fulltext(data, version_idx)
    if parent is None:
        old_texts = []
    else:
        old_texts = self._get_content(parent).text()
    new_texts = new_content.text()
    delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
    return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)

437

438

def get_graph_with_ghosts(self):
    """See VersionedFile.get_graph_with_ghosts().

    Returns the index's graph items collected into a dict.
    """
    return dict(self._index.get_graph())

442

443

def get_sha1(self, version_id):
    """See VersionedFile.get_sha1().

    The value is read from the innermost last element of the last
    component returned by _get_components().
    """
    last_component = self._get_components(version_id)[-1]
    innermost = last_component[-1]
    return innermost[-1]

447

448

@staticmethod
def get_suffixes():
    """See VersionedFile.get_suffixes().

    Returns a new list: the data suffix followed by the index suffix.
    """
    suffixes = [DATA_SUFFIX]
    suffixes.append(INDEX_SUFFIX)
    return suffixes

452

453

def has_ghost(self, version_id):
    """True if there is a ghost reference in the file to version_id.

    A ghost is a version that some entry names as a parent but that is
    not itself present in the index cache.
    """
    # maybe we have it
    if self.has_version(version_id):
        return False
    # optimisable if needed by memoising the _ghosts set.
    for node, node_parents in self._index.get_graph():
        for candidate in node_parents:
            if candidate not in self._index._cache and candidate == version_id:
                return True
    return False

466

467

def versions(self):
    """See VersionedFile.versions.

    Delegates to the index's get_versions().
    """
    index = self._index
    return index.get_versions()

470

471

def has_version(self, version_id):
    """See VersionedFile.has_version.

    Delegates to the index's has_version().
    """
    index = self._index
    return index.has_version(version_id)

# ``version_id in knit`` is answered by has_version().
__contains__ = has_version

476

477

def _merge_annotations(self, content, parents, parent_texts={},

478

delta=None, annotated=None):

479

"""Merge annotations for content. This is done by comparing

480

the annotations based on changed to the text.

481

"""

482

if annotated:

483

delta_seq = None

484

for parent_id in parents:

485

merge_content = self._get_content(parent_id, parent_texts)

486

seq = KnitSequenceMatcher(None, merge_content.text(), content.text())

487

if delta_seq is None:

488

# setup a delta seq to reuse.

489

delta_seq = seq

490

for i, j, n in seq.get_matching_blocks():

491

if n == 0:

492

continue

493

# this appears to copy (origin, text) pairs across to the new

494

# content for any line that matches the last-checked parent.

495

# FIXME: save the sequence control data for delta compression

496

# against the most relevant parent rather than rediffing.

497

content._lines[j:j+n] = merge_content._lines[i:i+n]

498

if delta:

499

if not annotated:

500

reference_content = self._get_content(parents[0], parent_texts)

501

new_texts = content.text()

502

old_texts = reference_content.text()

503

delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)

504

return self._make_line_delta(delta_seq, content)

505

506

def _make_line_delta(self, delta_seq, new_content):

507

"""Generate a line delta from delta_seq and new_content."""

508

diff_hunks = []

509

for op in delta_seq.get_opcodes():

510

if op[0] == 'equal':

511

continue

512

diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))

513

return diff_hunks

514

515

def _get_component_versions(self, version_id):

516

basis = self.basis_knit

517

needed_versions = []

518

basis_versions = []

519

cursor = version_id

520

521

while 1:

522

picked_knit = self

523

if basis and basis._index.has_version(cursor):

524

picked_knit = basis

525

basis_versions.append(cursor)

526

method = picked_knit._index.get_method(cursor)

527

needed_versions.append((method, cursor))

528

if method == 'fulltext':

529

break

530

cursor = picked_knit.get_parents(cursor)[0]

531

return needed_versions, basis_versions

532

533

def _get_component_positions(self, version_id):

534

needed_versions, basis_versions = \

535

self._get_component_versions(version_id)

536

assert len(basis_versions) == 0

537

positions = []

538

for method, comp_id in needed_versions:

539

data_pos, data_size = self._index.get_position(comp_id)

540

positions.append((method, comp_id, data_pos, data_size))

541

return positions

542

543

def _get_components(self, version_id):

544

"""Return a list of (version_id, method, data) tuples that

545

makes up version specified by version_id of the knit.

546

547

The components should be applied in the order of the returned

548

list.

549

550

The basis knit will be used to the largest extent possible

551

since it is assumed that accesses to it is faster.

552

"""

553

#profile notes:

554

# 4168 calls in 14912, 2289 internal

555

# 4168 in 9711 to read_records

556

# 52554 in 1250 to get_parents

557

# 170166 in 865 to list.append

558

559

# needed_revisions holds a list of (method, version_id) of

560

# versions that is needed to be fetched to construct the final

561

# version of the file.

562

563

# basis_revisions is a list of versions that needs to be

564

# fetched but exists in the basis knit.

565

566

needed_versions, basis_versions = \

567

self._get_component_versions(version_id)

568

569

components = {}

570

if basis_versions:

571

assert False, "I am broken"

572

basis = self.basis_knit

573

records = []

574

for comp_id in basis_versions:

575

data_pos, data_size = basis._index.get_data_position(comp_id)

576

records.append((comp_id, data_pos, data_size))

577

components.update(basis._data.read_records(records))

578

579

records = []

580

for comp_id in [vid for method, vid in needed_versions

581

if vid not in basis_versions]:

582

data_pos, data_size = self._index.get_position(comp_id)

583

records.append((comp_id, data_pos, data_size))

584

components.update(self._data.read_records(records))

585

586

# get_data_records returns a mapping with the version id as

587

# index and the value as data. The order the components need

588

# to be applied is held by needed_versions (reversed).

589

out = []

590

for method, comp_id in reversed(needed_versions):

591

out.append((comp_id, method, components[comp_id]))

592

593

return out

594

595

def _get_content(self, version_id, parent_texts={}):

596

"""Returns a content object that makes up the specified

597

version."""

598

if not self.has_version(version_id):

599

raise RevisionNotPresent(version_id, self.filename)

600

601

cached_version = parent_texts.get(version_id, None)

602

if cached_version is not None:

603

return cached_version

604

605

if self.basis_knit and version_id in self.basis_knit:

606

return self.basis_knit._get_content(version_id)

607

608

content = None

609

components = self._get_components(version_id)

610

for component_id, method, (data, digest) in components:

611

version_idx = self._index.lookup(component_id)

612

if method == 'fulltext':

613

assert content is None

614

content = self.factory.parse_fulltext(data, version_idx)

615

elif method == 'line-delta':

616

delta = self.factory.parse_line_delta(data, version_idx)

617

content._lines = self._apply_delta(content._lines, delta)

618

619

if 'no-eol' in self._index.get_options(version_id):

620

line = content._lines[-1][1].rstrip('\n')

621

content._lines[-1] = (content._lines[-1][0], line)

622

623

# digest here is the digest from the last applied component.

624

if sha_strings(content.text()) != digest:

625

raise KnitCorrupt(self.filename, 'sha-1 does not match %s' % version_id)

626

627

return content

628

629

def _check_versions_present(self, version_ids):

630

"""Check that all specified versions are present."""

631

version_ids = set(version_ids)

632

for r in list(version_ids):

633

if self._index.has_version(r):

634

version_ids.remove(r)

635

if version_ids:

636

raise RevisionNotPresent(list(version_ids)[0], self.filename)

637

638

def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):

639

"""See VersionedFile.add_lines_with_ghosts()."""

640

self._check_add(version_id, lines)

641

return self._add(version_id, lines[:], parents, self.delta, parent_texts)

642

643

def _add_lines(self, version_id, parents, lines, parent_texts):

644

"""See VersionedFile.add_lines."""

645

self._check_add(version_id, lines)

646

self._check_versions_present(parents)

647

return self._add(version_id, lines[:], parents, self.delta, parent_texts)

648

649

def _check_add(self, version_id, lines):
    """check that version_id and lines are safe to add.

    :raises InvalidRevisionId: if version_id contains whitespace.
    :raises RevisionAlreadyPresent: if version_id is already stored.
    """
    assert self.writable, "knit is not opened for write"
    ### FIXME escape. RBC 20060228
    id_is_invalid = contains_whitespace(version_id)
    if id_is_invalid:
        raise InvalidRevisionId(version_id, self.filename)
    already_stored = self.has_version(version_id)
    if already_stored:
        raise RevisionAlreadyPresent(version_id, self.filename)
    self._check_lines_not_unicode(lines)
    self._check_lines_are_lines(lines)

659

660

def _add(self, version_id, lines, parents, delta, parent_texts):
    """Add a set of lines on top of version specified by parents.

    If delta is true, compress the text as a line-delta against
    the first parent.

    Any versions not present will be converted into ghosts.

    lines is modified in place when its last line lacks a newline.

    :return: the content object built by the factory for the new text.
    """
    #  461    0   6546.0390     43.9100   bzrlib.knit:489(_add)
    #  +400   0    889.4890    418.9790   +bzrlib.knit:192(lower_fulltext)
    #  +461   0   1364.8070    108.8030   +bzrlib.knit:996(add_record)
    #  +461   0    193.3940     41.5720   +bzrlib.knit:898(add_version)
    #  +461   0    134.0590     18.3810   +bzrlib.osutils:361(sha_strings)
    #  +461   0     36.3420     15.4540   +bzrlib.knit:146(make)
    #  +1383  0      8.0370      8.0370   +<len>
    #  +61    0     13.5770      7.9190   +bzrlib.knit:199(lower_line_delta)
    #  +61    0    963.3470      7.8740   +bzrlib.knit:427(_get_content)
    #  +61    0    973.9950      5.2950   +bzrlib.knit:136(line_delta)
    #  +61    0   1918.1800      5.2640   +bzrlib.knit:359(_merge_annotations)

    if parent_texts is None:
        parent_texts = {}
    # parents that are not stored in this knit are ghosts and cannot be
    # used as a delta basis.
    present_parents = [parent for parent in parents
                       if self.has_version(parent)]

    if delta and not present_parents:
        delta = False

    # the digest is taken before any newline is appended below.
    digest = sha_strings(lines)
    options = []
    if lines and lines[-1][-1] != '\n':
        options.append('no-eol')
        lines[-1] = lines[-1] + '\n'

    if present_parents and delta:
        # To speed the extract of texts the delta chain is limited
        # to a fixed number of deltas.  This should minimize both
        # I/O and the time spend applying deltas.
        delta_parents = present_parents
        for depth in range(25):
            parent = delta_parents[0]
            method = self._index.get_method(parent)
            if method == 'fulltext':
                break
            delta_parents = self._index.get_parents(parent)
        if method == 'line-delta':
            # no fulltext within reach: store this one as a fulltext.
            delta = False

    lines = self.factory.make(lines, version_id)
    annotated = self.factory.annotated
    if delta or (annotated and len(present_parents) > 0):
        # Merge annotations from parent texts if so is needed.
        delta_hunks = self._merge_annotations(lines, present_parents,
                                              parent_texts, delta,
                                              self.factory.annotated)

    if delta:
        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta_hunks)
    else:
        options.append('fulltext')
        store_lines = self.factory.lower_fulltext(lines)

    where, size = self._data.add_record(version_id, digest, store_lines)
    self._index.add_version(version_id, options, where, size, parents)
    return lines

732

733

def check(self, progress_bar=None):
    """See VersionedFile.check().

    This implementation performs no checks and returns None.
    """
    return None

735

736

def _clone_text(self, new_version_id, old_version_id, parents):

737

"""See VersionedFile.clone_text()."""

738

# FIXME RBC 20060228 make fast by only inserting an index with null

739

# delta.

740

self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

741

742

def get_lines(self, version_id):
    """See VersionedFile.get_lines()."""
    line_lists = self.get_line_list([version_id])
    return line_lists[0]

745

746

def _get_version_components(self, position_map):

747

records = []

748

for version_id, positions in position_map.iteritems():

749

for method, comp_id, position, size in positions:

750

records.append((comp_id, position, size))

751

record_map = self._data.read_records(records)

752

753

component_map = {}

754

for version_id, positions in position_map.iteritems():

755

components = []

756

for method, comp_id, position, size in positions:

757

data, digest = record_map[comp_id]

758

components.append((comp_id, method, data, digest))

759

component_map[version_id] = components

760

return component_map

761

762

def get_text(self, version_id):
    """See VersionedFile.get_text"""
    texts = self.get_texts([version_id])
    return texts[0]

765

766

def get_texts(self, version_ids):
    """Return the text of each listed version as a single string."""
    texts = []
    for line_list in self.get_line_list(version_ids):
        texts.append(''.join(line_list))
    return texts

768

769

def get_line_list(self, version_ids):
    """Return the texts of listed versions as a list of strings.

    The result has one list of lines per entry of version_ids, in the
    same order.

    :raises RevisionNotPresent: if a listed version is not in the knit.
    :raises KnitCorrupt: if a rebuilt text does not match its digest.
    """
    position_map = {}
    for version_id in version_ids:
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)
        position_map[version_id] = \
            self._get_component_positions(version_id)

    component_map = self._get_version_components(position_map)

    text_map = {}
    for version_id, components in component_map.items():
        content = None
        # components run from the requested version back to its
        # fulltext, so apply them in reverse.
        for component_id, method, data, digest in reversed(components):
            version_idx = self._index.lookup(component_id)
            if method == 'fulltext':
                assert content is None
                content = self.factory.parse_fulltext(data, version_idx)
            elif method == 'line-delta':
                delta = self.factory.parse_line_delta(data, version_idx)
                content._lines = self._apply_delta(content._lines, delta)

        if 'no-eol' in self._index.get_options(version_id):
            last = content._lines[-1]
            content._lines[-1] = (last[0], last[1].rstrip('\n'))

        # digest here is the digest from the last applied component.
        if sha_strings(content.text()) != digest:
            raise KnitCorrupt(self.filename,
                              'sha-1 does not match %s' % version_id)

        text_map[version_id] = content.text()
    return [text_map[requested] for requested in version_ids]

803

804

def iter_lines_added_or_present_in_versions(self, version_ids=None):
    """See VersionedFile.iter_lines_added_or_present_in_versions().

    Yields every line stored in the records of the requested versions
    (all versions when version_ids is None): the lines of fulltext
    records and the inserted lines of line-delta records. Records are
    visited in the order of self.versions().

    :raises RevisionNotPresent: if a requested version is not in the knit
        (raised when iteration starts, as this is a generator).
    """
    if version_ids is None:
        version_ids = self.versions()
    # we don't care about inclusions, the caller cares.
    # but we need to setup a list of records to visit.
    requested_versions = list(version_ids)
    # filter for available versions
    for version_id in requested_versions:
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)
    # a set keeps the membership test below O(1) per version.
    requested_set = set(requested_versions)
    # get a in-component-order queue of (version_id, position, length):
    version_id_records = []
    for version_id in self.versions():
        if version_id in requested_set:
            data_pos, length = self._index.get_position(version_id)
            version_id_records.append((version_id, data_pos, length))

    pb = bzrlib.ui.ui_factory.nested_progress_bar()
    count = 0
    total = len(version_id_records)
    # try/except + re-raise rather than try/finally: a generator cannot
    # yield inside try/finally on Python 2.4.
    try:
        pb.update('Walking content.', count, total)
        for version_id, data, sha_value in \
                self._data.read_records_iter(version_id_records):
            pb.update('Walking content.', count, total)
            method = self._index.get_method(version_id)
            version_idx = self._index.lookup(version_id)
            assert method in ('fulltext', 'line-delta')
            if method == 'fulltext':
                content = self.factory.parse_fulltext(data, version_idx)
                for line in content.text():
                    yield line
            else:
                delta = self.factory.parse_line_delta(data, version_idx)
                # the hunk's own line count must not reuse the name
                # 'count', which is the progress counter.
                for start, end, hunk_length, lines in delta:
                    for origin, line in lines:
                        yield line
            count += 1
        pb.update('Walking content.', total, total)
        pb.finished()
    except:
        pb.update('Walking content.', total, total)
        pb.finished()
        raise

852

853

def num_versions(self):
    """See VersionedFile.num_versions().

    Delegates to the index's num_versions().
    """
    index = self._index
    return index.num_versions()

856

857

__len__ = num_versions

858

859

def annotate_iter(self, version_id):
    """See VersionedFile.annotate_iter.

    Yields (origin, text) pairs from the content of version_id.
    """
    annotated = self._get_content(version_id).annotate_iter()
    for origin, text in annotated:
        yield origin, text

864

865

def get_parents(self, version_id):
    """See VersionedFile.get_parents.

    :raises RevisionNotPresent: if the index lookup raises KeyError.
    """
    # perf notes:
    # optimism counts!
    # 52554 calls in 1264 872 internal down from 3674
    index = self._index
    try:
        parents = index.get_parents(version_id)
    except KeyError:
        raise RevisionNotPresent(version_id, self.filename)
    return parents

874

875

def get_parents_with_ghosts(self, version_id):
    """See VersionedFile.get_parents_with_ghosts.

    :raises RevisionNotPresent: if the index lookup raises KeyError.
    """
    index = self._index
    try:
        parents = index.get_parents_with_ghosts(version_id)
    except KeyError:
        raise RevisionNotPresent(version_id, self.filename)
    return parents

881

882

def get_ancestry(self, versions):
    """See VersionedFile.get_ancestry.

    A single version id given as a string is treated as a one-item list.
    """
    if isinstance(versions, basestring):
        versions = [versions]
    if versions:
        self._check_versions_present(versions)
        return self._index.get_ancestry(versions)
    return []

890

891

def get_ancestry_with_ghosts(self, versions):
    """See VersionedFile.get_ancestry_with_ghosts.

    A single version id given as a string is treated as a one-item list.
    """
    if isinstance(versions, basestring):
        versions = [versions]
    if versions:
        self._check_versions_present(versions)
        return self._index.get_ancestry_with_ghosts(versions)
    return []

899

900

#@deprecated_method(zero_eight)

901

def walk(self, version_ids):
    """See VersionedFile.walk."""
    # We take the short path here, and extract all relevant texts
    # and put them in a weave and let that do all the work.  Far
    # from optimal, but is much simpler.
    # FIXME RB 20060228 this really is inefficient!
    from bzrlib.weave import Weave

    weave = Weave(self.filename)
    ancestry = self.get_ancestry(version_ids)
    # add the ancestry to the weave in topological order.
    for version_id in topo_sort(self._index.get_graph()):
        if version_id not in ancestry:
            continue
        lines = self.get_lines(version_id)
        weave.add_lines(version_id, self.get_parents(version_id), lines)

    for lineno, insert_id, dset, line in weave.walk(version_ids):
        yield lineno, insert_id, dset, line

920

921

def plan_merge(self, ver_a, ver_b):
    """See VersionedFile.plan_merge.

    Yields (state, text) pairs where state is one of 'unchanged',
    'killed-a', 'killed-b', 'new-a' or 'new-b'.
    """
    ancestors_b = set(self.get_ancestry(ver_b))

    def status_a(revision, text):
        # a line only in A: B either removed it or never had it.
        if revision in ancestors_b:
            return 'killed-b', text
        return 'new-a', text

    ancestors_a = set(self.get_ancestry(ver_a))

    def status_b(revision, text):
        # a line only in B: A either removed it or never had it.
        if revision in ancestors_a:
            return 'killed-a', text
        return 'new-b', text

    annotated_a = self.annotate(ver_a)
    annotated_b = self.annotate(ver_b)
    plain_a = [text for (origin, text) in annotated_a]
    plain_b = [text for (origin, text) in annotated_b]
    matcher = KnitSequenceMatcher(None, plain_a, plain_b)
    a_cur = 0
    b_cur = 0
    for a_start, b_start, length in matcher.get_matching_blocks():
        # process all mismatched sections
        # (last mismatched section is handled because blocks always
        # includes a 0-length last block)
        for revision, text in annotated_a[a_cur:a_start]:
            yield status_a(revision, text)
        for revision, text in annotated_b[b_cur:b_start]:
            yield status_b(revision, text)

        # and now the matched section
        a_cur = a_start + length
        b_cur = b_start + length
        for text_a, text_b in zip(plain_a[a_start:a_cur],
                                  plain_b[b_start:b_cur]):
            assert text_a == text_b
            yield "unchanged", text_a

959

960

961

class _KnitComponentFile(object):

962

"""One of the files used to implement a knit database"""

963

964

def __init__(self, transport, filename, mode, file_mode=None):

965

self._transport = transport

966

self._filename = filename

967

self._mode = mode

968

self._file_mode=file_mode

969

970

def write_header(self):

971

if self._transport.append(self._filename, StringIO(self.HEADER),

972

mode=self._file_mode):

973

raise KnitCorrupt(self._filename, 'misaligned after writing header')

974

975

def check_header(self, fp):

976

line = fp.readline()

977

if line != self.HEADER:

978

raise KnitHeaderError(badline=line)

979

980

def commit(self):

981

"""Commit is a nop."""

982

983

def __repr__(self):

984

return '%s(%s)' % (self.__class__.__name__, self._filename)

985

986

987

class _KnitIndex(_KnitComponentFile):

988

"""Manages knit index file.

989

990

The index is already kept in memory and read on startup, to enable

991

fast lookups of revision information. The cursor of the index

992

file is always pointing to the end, making it easy to append

993

entries.

994

995

_cache is a cache for fast mapping from version id to a Index

996

object.

997

998

_history is a cache for fast mapping from indexes to version ids.

999

1000

The index data format is dictionary compressed when it comes to

1001

parent references; an index entry may only have parents with a

1002

lower index number. As a result, the index is topologically sorted.

1003

1004

Duplicate entries may be written to the index for a single version id

1005

if this is done then the latter one completely replaces the former:

1006

this allows updates to correct version and parent information.

1007

Note that the two entries may share the delta, and that successive

1008

annotations and references MUST point to the first entry.

1009

1010

The index file on disc contains a header, followed by one line per knit

1011

record. The same revision can be present in an index file more than once.

1012

The first occurrence gets assigned a sequence number starting from 0.

1013

1014

The format of a single line is

1015

REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n

1016

REVISION_ID is a utf8-encoded revision id

1017

FLAGS is a comma separated list of flags about the record. Values include

1018

no-eol, line-delta, fulltext.

1019

BYTE_OFFSET is the ascii representation of the byte offset in the data file

1020

that the compressed data starts at.

1021

LENGTH is the ascii representation of the length of the compressed record in the data file.

1022

PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of

1023

REVISION_ID.

1024

PARENT_SEQUENCE_ID the ascii representation of the sequence number of a

1025

revision id already in the knit that is a parent of REVISION_ID.

1026

The ' :' marker is the end of record marker.

1027

1028

partial writes:

1029

when a write is interrupted to the index file, it will result in a line that

1030

does not end in ' :'. If the ' :' is not present at the end of a line, or at

1031

the end of the file, then the record that is missing it will be ignored by

1032

the parser.

1033

1034

When writing new records to the index file, the data is preceded by '\n'

1035

to ensure that records always start on new lines even if the last write was

1036

interrupted. As a result it's normal for the last line in the index to be

1037

missing a trailing newline. One can be added with no harmful effects.

1038

"""

1039

1040

HEADER = "# bzr knit index 8\n"

1041

1042

# speed of knit parsing went from 280 ms to 280 ms with slots addition.

1043

# __slots__ = ['_cache', '_history', '_transport', '_filename']

1044

1045

def _cache_version(self, version_id, options, pos, size, parents):

1046

"""Cache a version record in the history array and index cache.

1047

1048

This is inlined into __init__ for performance. KEEP IN SYNC.

1049

(It saves 60ms, 25% of the __init__ overhead on local 4000 record

1050

indexes).

1051

"""

1052

# only want the _history index to reference the 1st index entry

1053

# for version_id

1054

if version_id not in self._cache:

1055

index = len(self._history)

1056

self._history.append(version_id)

1057

else:

1058

index = self._cache[version_id][5]

1059

self._cache[version_id] = (version_id,

1060

options,

1061

pos,

1062

size,

1063

parents,

1064

index)

1065

1066

def __init__(self, transport, filename, mode, create=False, file_mode=None):
    """Read the index file into memory.

    If the file does not exist and mode is 'w' with create set, a new
    file holding just the header is written; otherwise NoSuchFile
    propagates.
    """
    _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
    self._cache = {}
    # position in _history is the 'official' index for a revision
    # but the values may have come from a newer entry.
    # so - wc -l of a knit index is != the number of unique names
    # in the knit.
    self._history = []
    cache = self._cache
    history = self._history
    pb = bzrlib.ui.ui_factory.nested_progress_bar()
    try:
        count = 0
        total = 1
        try:
            pb.update('read knit index', count, total)
            fp = self._transport.get(self._filename)
            self.check_header(fp)
            # readlines reads the whole file at once:
            # bad for transports like http, good for local disk
            # we save 60 ms doing this one change (
            # from calling readline each time to calling
            # readlines once.
            # probably what we want for nice behaviour on
            # http is a incremental readlines that yields, or
            # a check for local vs non local indexes,
            for line in fp.readlines():
                rec = line.split()
                if len(rec) < 5 or rec[-1] != ':':
                    # corrupt line.
                    # FIXME: in the future we should determine if its a
                    # short write - and ignore it
                    # or a different failure, and raise. RBC 20060407
                    continue
                count += 1
                total += 1
                #pb.update('read knit index', count, total)
                # --- inlined self._parse_parents
                parents = []
                for value in rec[4:-1]:
                    if '.' == value[0]:
                        # uncompressed reference
                        parents.append(value[1:])
                    else:
                        # this is 15/4000ms faster than isinstance,
                        # (in lsprof)
                        # this function is called thousands of times a
                        # second so small variations add up.
                        assert value.__class__ is str
                        parents.append(history[int(value)])
                # --- end self._parse_parents
                # --- inlined self._cache_version(rec[0],
                #                                 rec[1].split(','),
                #                                 int(rec[2]),
                #                                 int(rec[3]),
                #                                 parents)
                # only want the _history index to reference the 1st
                # index entry for version_id
                version_id = rec[0]
                if version_id in cache:
                    index = cache[version_id][5]
                else:
                    index = len(history)
                    history.append(version_id)
                cache[version_id] = (version_id,
                                     rec[1].split(','),
                                     int(rec[2]),
                                     int(rec[3]),
                                     parents,
                                     index)
                # --- end self._cache_version
        except NoSuchFile:
            if not (mode == 'w' and create):
                raise
            self.write_header()
    finally:
        pb.update('read knit index', total, total)
        pb.finished()

1143

1144

def _parse_parents(self, compressed_parents):

1145

"""convert a list of string parent values into version ids.

1146

1147

ints are looked up in the index.

1148

.FOO values are ghosts and converted in to FOO.

1149

1150

NOTE: the function is retained here for clarity, and for possible

1151

use in partial index reads. However bulk processing now has

1152

it inlined in __init__ for inner-loop optimisation.

1153

"""

1154

result = []

1155

for value in compressed_parents:

1156

if value[-1] == '.':

1157

# uncompressed reference

1158

result.append(value[1:])

1159

else:

1160

# this is 15/4000ms faster than isinstance,

1161

# this function is called thousands of times a

1162

# second so small variations add up.

1163

assert value.__class__ is str

1164

result.append(self._history[int(value)])

1165

return result

1166

1167

def get_graph(self):
    """Return a list of (version_id, parents) for every cached version.

    The parents are as recorded in the cache entry, ghosts included.
    """
    return [(version_id, entry[4])
            for version_id, entry in self._cache.iteritems()]

1172

1173

def get_ancestry(self, versions):
    """See VersionedFile.get_ancestry.

    Ghost parents (those not in the cache) are left out of the result.
    """
    # get a graph of all the mentioned versions:
    cache = self._cache
    graph = {}
    pending = set(versions)
    while pending:
        version = pending.pop()
        # trim ghosts
        present = [parent for parent in cache[version][4]
                   if parent in cache]
        for parent in present:
            # queue parents that have not been completed yet
            if parent not in graph:
                pending.add(parent)
        graph[version] = present
    return topo_sort(graph.items())

1190

1191

def get_ancestry_with_ghosts(self, versions):
    """See VersionedFile.get_ancestry_with_ghosts.

    Versions missing from the cache (ghosts) appear in the result with
    no parents.
    """
    # get a graph of all the mentioned versions:
    cache = self._cache
    graph = {}
    pending = set(versions)
    while pending:
        version = pending.pop()
        if version not in cache:
            # ghost, fake it
            graph[version] = []
            continue
        parents = cache[version][4]
        for parent in parents:
            if parent not in graph:
                pending.add(parent)
        graph[version] = parents
    return topo_sort(graph.items())

1211

1212

def num_versions(self):
    """Return the number of distinct version ids in the index."""
    history = self._history
    return len(history)

1214

1215

__len__ = num_versions

1216

1217

def get_versions(self):
    """Return the version ids in index order.

    This is the internal _history list itself, not a copy.
    """
    history = self._history
    return history

1219

1220

def idx_to_name(self, idx):
    """Return the version id that has sequence number idx."""
    history = self._history
    return history[idx]

1222

1223

def lookup(self, version_id):
    """Return the sequence number of version_id, which must be cached."""
    cache = self._cache
    assert version_id in cache
    return cache[version_id][5]

1226

1227

def _version_list_to_index(self, versions):

1228

result_list = []

1229

for version in versions:

1230

if version in self._cache:

1231

# -- inlined lookup() --

1232

result_list.append(str(self._cache[version][5]))

1233

# -- end lookup () --

1234

else:

1235

result_list.append('.' + version.encode('utf-8'))

1236

return ' '.join(result_list)

1237

1238

def add_version(self, version_id, options, pos, size, parents):
    """Add a version record to the index."""
    record = (version_id, options, pos, size, parents)
    self.add_versions((record,))

1241

1242

def add_versions(self, versions):
    """Add multiple versions to the index.

    All records are written with a single append, then cached.

    :param versions: a list of tuples:
                     (version_id, options, pos, size, parents).
    """
    pieces = []
    for version_id, options, pos, size, parents in versions:
        fields = (version_id.encode('utf-8'),
                  ','.join(options),
                  pos,
                  size,
                  self._version_list_to_index(parents))
        # each record starts with '\n' so it always begins on a new line.
        line = "\n%s %s %s %s %s :" % fields
        assert isinstance(line, str), \
            'content must be utf-8 encoded: %r' % (line,)
        pieces.append(line)
    payload = StringIO(''.join(pieces))
    self._transport.append(self._filename, payload)
    # cache after writing, so that a failed write leads to missing cache
    # entries not extra ones. XXX TODO: RBC 20060502 in the event of a
    # failure, reload the index or flush it or some such, to prevent
    # writing records that did complete twice.
    for version_id, options, pos, size, parents in versions:
        self._cache_version(version_id, options, pos, size, parents)

1265

1266

def has_version(self, version_id):
    """True if the version is in the index."""
    # 'in' rather than dict.has_key(), matching the other membership
    # tests on self._cache in this class.
    return version_id in self._cache

1269

1270

def get_position(self, version_id):
    """Return data position and size of specified version."""
    entry = self._cache[version_id]
    return (entry[2], entry[3])

1274

1275

def get_method(self, version_id):
    """Return compression method of specified version.

    The result is 'fulltext' or 'line-delta'.
    """
    options = self._cache[version_id][1]
    if 'fulltext' in options:
        return 'fulltext'
    assert 'line-delta' in options
    return 'line-delta'

1283

1284

def get_options(self, version_id):
    """Return the options (flags) cached for version_id."""
    entry = self._cache[version_id]
    return entry[1]

1286

1287

def get_parents(self, version_id):
    """Return parents of specified version ignoring ghosts."""
    cache = self._cache
    present = []
    for parent in cache[version_id][4]:
        if parent in cache:
            present.append(parent)
    return present

1291

1292

def get_parents_with_ghosts(self, version_id):
    """Return parents of specified version with ghosts."""
    entry = self._cache[version_id]
    return entry[4]

1295

1296

def check_versions_present(self, version_ids):
    """Check that all specified versions are present.

    :raises RevisionNotPresent: naming one of the versions that is not
        in the index.
    """
    version_ids = set(version_ids)
    for version_id in list(version_ids):
        if version_id in self._cache:
            version_ids.remove(version_id)
    if version_ids:
        # this class stores its file name as _filename (set by
        # _KnitComponentFile.__init__); it has no 'filename' attribute.
        raise RevisionNotPresent(list(version_ids)[0], self._filename)

1304

1305

1306

class _KnitData(_KnitComponentFile):

1307

"""Contents of the knit data file"""

1308

1309

HEADER = "# bzr knit data 8\n"

1310

1311

def __init__(self, transport, filename, mode, create=False, file_mode=None):
    """Set up access to a knit data file.

    When create is true an empty file is put in place with file_mode.
    """
    # forward file_mode so that self._file_mode is recorded, as
    # _KnitIndex.__init__ does; it was previously dropped here.
    _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
    self._file = None
    self._checked = False
    if create:
        self._transport.put(self._filename, StringIO(''), mode=file_mode)
    # record cache: version_id -> (digest, lines), filled by add_record.
    self._records = {}

1318

1319

def clear_cache(self):
    """Clear the record cache.

    The cache is replaced by a fresh dict; the old one is not emptied.
    """
    empty_cache = {}
    self._records = empty_cache

1322

1323

def _open_file(self):

1324

if self._file is None:

1325

try:

1326

self._file = self._transport.get(self._filename)

1327

except NoSuchFile:

1328

pass

1329

return self._file

1330

1331

def _record_to_data(self, version_id, digest, lines):

1332

"""Convert version_id, digest, lines into a raw data block.

1333

1334

:return: (len, a StringIO instance with the raw data ready to read.)

1335

"""

1336

sio = StringIO()

1337

data_file = GzipFile(None, mode='wb', fileobj=sio)

1338

data_file.writelines(chain(

1339

["version %s %d %s\n" % (version_id.encode('utf-8'),

1340

len(lines),

1341

digest)],

1342

lines,

1343

["end %s\n" % version_id.encode('utf-8')]))

1344

data_file.close()

1345

length= sio.tell()

1346

1347

sio.seek(0)

1348

return length, sio

1349

1350

def add_raw_record(self, raw_data):

1351

"""Append a prepared record to the data file.

1352

1353

:return: the offset in the data file raw_data was written.

1354

"""

1355

assert isinstance(raw_data, str), 'data must be plain bytes'

1356

return self._transport.append(self._filename, StringIO(raw_data))

1357

1358

def add_record(self, version_id, digest, lines):

1359

"""Write new text record to disk. Returns the position in the

1360

file where it was written."""

1361

size, sio = self._record_to_data(version_id, digest, lines)

1362

# cache

1363

self._records[version_id] = (digest, lines)

1364

# write to disk

1365

start_pos = self._transport.append(self._filename, sio)

1366

return start_pos, size

1367

1368

def _parse_record_header(self, version_id, raw_data):

1369

"""Parse a record header for consistency.

1370

1371

:return: the header and the decompressor stream.

1372

as (stream, header_record)

1373

"""

1374

df = GzipFile(mode='rb', fileobj=StringIO(raw_data))

1375

rec = df.readline().split()

1376

if len(rec) != 4:

1377

raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')

1378

if rec[1].decode('utf-8')!= version_id:

1379

raise KnitCorrupt(self._filename,

1380

'unexpected version, wanted %r, got %r' % (

1381

version_id, rec[1]))

1382

return df, rec

1383

1384

def _parse_record(self, version_id, data):

1385

# profiling notes:

1386

# 4168 calls in 2880 217 internal

1387

# 4168 calls to _parse_record_header in 2121

1388

# 4168 calls to readlines in 330

1389

df, rec = self._parse_record_header(version_id, data)

1390

record_contents = df.readlines()

1391

l = record_contents.pop()

1392

assert len(record_contents) == int(rec[2])

1393

if l.decode('utf-8') != 'end %s\n' % version_id:

1394

raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'

1395

% (l, version_id))

1396

df.close()

1397

return record_contents, rec[3]

1398

1399

def read_records_iter_raw(self, records):

1400

"""Read text records from data file and yield raw data.

1401

1402

This unpacks enough of the text record to validate the id is

1403

as expected but thats all.

1404

1405

It will actively recompress currently cached records on the

1406

basis that that is cheaper than I/O activity.

1407

"""

1408

needed_records = []

1409

for version_id, pos, size in records:

1410

if version_id not in self._records:

1411

needed_records.append((version_id, pos, size))

1412

1413

# setup an iterator of the external records:

1414

# uses readv so nice and fast we hope.

1415

if len(needed_records):

1416

# grab the disk data needed.

1417

raw_records = self._transport.readv(self._filename,

1418

[(pos, size) for version_id, pos, size in needed_records])

1419

1420

for version_id, pos, size in records:

1421

if version_id in self._records:

1422

# compress a new version

1423

size, sio = self._record_to_data(version_id,

1424

self._records[version_id][0],

1425

self._records[version_id][1])

1426

yield version_id, sio.getvalue()

1427

else:

1428

pos, data = raw_records.next()

1429

# validate the header

1430

df, rec = self._parse_record_header(version_id, data)

1431

df.close()

1432

yield version_id, data

1433

1434

1435

def read_records_iter(self, records):

1436

"""Read text records from data file and yield result.

1437

1438

Each passed record is a tuple of (version_id, pos, len) and

1439

will be read in the given order. Yields (version_id,

1440

contents, digest).

1441

"""

1442

# profiling notes:

1443

# 60890 calls for 4168 extractions in 5045, 683 internal.

1444

# 4168 calls to readv in 1411

1445

# 4168 calls to parse_record in 2880

1446

1447

needed_records = []

1448

for version_id, pos, size in records:

1449

if version_id not in self._records:

1450

needed_records.append((version_id, pos, size))

1451

1452

if len(needed_records):

1453

needed_records.sort(key=operator.itemgetter(1))

1454

# We take it that the transport optimizes the fetching as good

1455

# as possible (ie, reads continuous ranges.)

1456

response = self._transport.readv(self._filename,

1457

[(pos, size) for version_id, pos, size in needed_records])

1458

1459

for (record_id, pos, size), (pos, data) in \

1460

izip(iter(needed_records), response):

1461

content, digest = self._parse_record(record_id, data)

1462

self._records[record_id] = (digest, content)

1463

1464

for version_id, pos, size in records:

1465

yield version_id, list(self._records[version_id][1]), self._records[version_id][0]

1466

1467

def read_records(self, records):

1468

"""Read records into a dictionary."""

1469

components = {}

1470

for record_id, content, digest in self.read_records_iter(records):

1471

components[record_id] = (content, digest)

1472

return components

1473

1474

1475

class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_from_factory = KnitVersionedFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with knits.

        :return: True only when both source and target are
            KnitVersionedFile instances.
        """
        try:
            if not isinstance(source, KnitVersionedFile):
                return False
            return isinstance(target, KnitVersionedFile)
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join.

        Raw compressed records are copied from the source knit into the
        target, then parents are fixed up for versions both sides hold but
        disagree about.

        :return: the number of records copied.
        """
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)
        if not version_ids:
            return 0

        # progress is reported on a freshly nested bar, whatever pb was given.
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            target_versions = set(self.target._index.get_versions())
            to_copy = self.source_ancestry - target_versions
            shared = self.source_ancestry.intersection(target_versions)
            mismatched = set()
            for version in shared:
                # scan to include needed parents.
                target_parents = set(
                    self.target.get_parents_with_ghosts(version))
                source_parents = set(
                    self.source.get_parents_with_ghosts(version))
                if target_parents == source_parents:
                    continue
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the source-side parents.
                for parent in source_parents:
                    if parent not in target_parents:
                        parent_ancestors = self.source.get_ancestry(parent)
                        if version in parent_ancestors:
                            raise errors.GraphCycleError([parent, version])
                # ensure the extra parents will be available later.
                extra_parents = source_parents.difference(target_parents)
                to_copy.update(extra_parents.difference(target_versions))
                mismatched.add(version)

            if not (to_copy or mismatched):
                return 0

            # copy in topological order, skipping what the target holds.
            version_list = []
            for candidate in topo_sort(self.source.get_graph()):
                if self.target.has_version(candidate):
                    continue
                if candidate in to_copy:
                    version_list.append(candidate)

            # plan the join:
            copy_queue = []
            copy_queue_records = []
            scheduled = set()
            source_index = self.source._index
            for version_id in version_list:
                options = source_index.get_options(version_id)
                parents = source_index.get_parents_with_ghosts(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must :
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (self.target.has_version(parent) or
                            parent in scheduled or
                            not self.source.has_version(parent))
                data_pos, data_size = source_index.get_position(version_id)
                copy_queue_records.append((version_id, data_pos, data_size))
                copy_queue.append((version_id, options, parents))
                scheduled.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            raw_chunks = []
            raw_records = []
            raw_iter = self.source._data.read_records_iter_raw(
                copy_queue_records)
            for fetched, planned in izip(raw_iter, copy_queue):
                version_id, raw_data = fetched
                version_id2, options, parents = planned
                assert version_id == version_id2, 'logic error, inconsistent results'
                count += 1
                pb.update("Joining knit", count, total)
                raw_records.append(
                    (version_id, options, parents, len(raw_data)))
                raw_chunks.append(raw_data)
            self.target._add_raw_records(raw_records, ''.join(raw_chunks))

            for version in mismatched:
                # FIXME RBC 20060309 is this needed?
                target_parents = set(
                    self.target.get_parents_with_ghosts(version))
                source_parents = set(
                    self.source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the
                # current parents as first in the list
                combined = (self.target.get_parents_with_ghosts(version) +
                            list(source_parents.difference(target_parents)))
                self.target.fix_parents(version, combined)
            return count
        finally:
            pb.finished()

1587

1588

1589

# Hand the InterKnit class to InterVersionedFile.register_optimiser();
# presumably this is what lets knit-to-knit operations pick it up -- confirm
# against InterVersionedFile.
InterVersionedFile.register_optimiser(InterKnit)

1590

1591

1592

class WeaveToKnit(InterVersionedFile):
    """Optimised code paths for weave to knit operations."""

    _matching_file_from_factory = bzrlib.weave.WeaveFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with weaves to knits.

        :return: True only when source is a bzrlib.weave.Weave and target
            is a KnitVersionedFile.
        """
        try:
            if not isinstance(source, bzrlib.weave.Weave):
                return False
            return isinstance(target, KnitVersionedFile)
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join.

        Missing versions are re-added to the target knit line by line via
        add_lines(), then parents are fixed up for versions both sides hold
        but disagree about.

        :return: the number of versions added to the target.
        """
        assert isinstance(self.source, bzrlib.weave.Weave)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)
        if not version_ids:
            return 0

        # progress is reported on a freshly nested bar, whatever pb was given.
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            target_versions = set(self.target._index.get_versions())
            to_add = self.source_ancestry - target_versions
            shared = self.source_ancestry.intersection(target_versions)
            mismatched = set()
            for version in shared:
                # scan to include needed parents.
                target_parents = set(
                    self.target.get_parents_with_ghosts(version))
                source_parents = set(self.source.get_parents(version))
                # if every source parent is known to the target, it's fine.
                extra_parents = source_parents.difference(target_parents)
                if not extra_parents:
                    continue
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the source-side parents.
                for parent in source_parents:
                    if parent not in target_parents:
                        parent_ancestors = self.source.get_ancestry(parent)
                        if version in parent_ancestors:
                            raise errors.GraphCycleError([parent, version])
                # ensure the extra parents will be available later.
                to_add.update(extra_parents.difference(target_versions))
                mismatched.add(version)

            if not (to_add or mismatched):
                return 0

            # add in topological order, skipping what the target holds.
            version_list = []
            for candidate in topo_sort(self.source.get_graph()):
                if self.target.has_version(candidate):
                    continue
                if candidate in to_add:
                    version_list.append(candidate)

            # do the join:
            count = 0
            total = len(version_list)
            for version_id in version_list:
                pb.update("Converting to knit", count, total)
                parents = self.source.get_parents(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must already have it
                    assert (self.target.has_version(parent))
                self.target.add_lines(
                    version_id, parents, self.source.get_lines(version_id))
                count += 1

            for version in mismatched:
                # FIXME RBC 20060309 is this needed?
                target_parents = set(
                    self.target.get_parents_with_ghosts(version))
                source_parents = set(self.source.get_parents(version))
                # write a combined record to our history preserving the
                # current parents as first in the list
                combined = (self.target.get_parents_with_ghosts(version) +
                            list(source_parents.difference(target_parents)))
                self.target.fix_parents(version, combined)
            return count
        finally:
            pb.finished()

1680

1681

1682

# Hand the WeaveToKnit class to InterVersionedFile.register_optimiser();
# presumably this is what lets weave-to-knit operations pick it up -- confirm
# against InterVersionedFile.
InterVersionedFile.register_optimiser(WeaveToKnit)

1683

1684

1685

class KnitSequenceMatcher(difflib.SequenceMatcher):

1686

"""Knit tuned sequence matcher.

1687

1688

This is based on profiling of difflib which indicated some improvements

1689

for our usage pattern.

1690

"""

1691

1692

def find_longest_match(self, alo, ahi, blo, bhi):

1693

"""Find longest matching block in a[alo:ahi] and b[blo:bhi].

1694

1695

If isjunk is not defined:

1696

1697

Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where

1698

alo <= i <= i+k <= ahi

1699

blo <= j <= j+k <= bhi

1700

and for all (i',j',k') meeting those conditions,

1701

k >= k'

1702

i <= i'

1703

and if i == i', j <= j'

1704

1705

In other words, of all maximal matching blocks, return one that

1706

starts earliest in a, and of all those maximal matching blocks that

1707

start earliest in a, return the one that starts earliest in b.

1708

1709

>>> s = SequenceMatcher(None, " abcd", "abcd abcd")

1710

>>> s.find_longest_match(0, 5, 0, 9)

1711

(0, 4, 5)

1712

1713

If isjunk is defined, first the longest matching block is

1714

determined as above, but with the additional restriction that no

1715

junk element appears in the block. Then that block is extended as

1716

far as possible by matching (only) junk elements on both sides. So

1717

the resulting block never matches on junk except as identical junk

1718

happens to be adjacent to an "interesting" match.

1719

1720

Here's the same example as before, but considering blanks to be

1721

junk. That prevents " abcd" from matching the " abcd" at the tail

1722

end of the second sequence directly. Instead only the "abcd" can

1723

match, and matches the leftmost "abcd" in the second sequence:

1724

1725

>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")

1726

>>> s.find_longest_match(0, 5, 0, 9)

1727

(1, 0, 4)

1728

1729

If no blocks match, return (alo, blo, 0).

1730

1731

>>> s = SequenceMatcher(None, "ab", "c")

1732

>>> s.find_longest_match(0, 2, 0, 1)

1733

(0, 0, 0)

1734

"""

1735

1736

# CAUTION: stripping common prefix or suffix would be incorrect.

1737

# E.g.,

1738

# ab

1739

# acab

1740

# Longest matching block is "ab", but if common prefix is

1741

# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so

1742

# strip, so ends up claiming that ab is changed to acab by

1743

# inserting "ca" in the middle. That's minimal but unintuitive:

1744

# "it's obvious" that someone inserted "ac" at the front.

1745

# Windiff ends up at the same place as diff, but by pairing up

1746

# the unique 'b's and then matching the first two 'a's.

1747

1748

a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk

1749

besti, bestj, bestsize = alo, blo, 0

1750

# find longest junk-free match

1751

# during an iteration of the loop, j2len[j] = length of longest

1752

# junk-free match ending with a[i-1] and b[j]

1753

j2len = {}

1754

# nothing = []

1755

b2jget = b2j.get

1756

for i in xrange(alo, ahi):

1757

# look at all instances of a[i] in b; note that because

1758

# b2j has no junk keys, the loop is skipped if a[i] is junk

1759

j2lenget = j2len.get

1760

newj2len = {}

1761

1762

# changing b2j.get(a[i], nothing) to a try:KeyError pair produced the

1763

# following improvement

1764

# 704 0 4650.5320 2620.7410 bzrlib.knit:1336(find_longest_match)

1765

# +326674 0 1655.1210 1655.1210 +<method 'get' of 'dict' objects>

1766

# +76519 0 374.6700 374.6700 +<method 'has_key' of 'dict' objects>

1767

# to

1768

# 704 0 3733.2820 2209.6520 bzrlib.knit:1336(find_longest_match)

1769

# +211400 0 1147.3520 1147.3520 +<method 'get' of 'dict' objects>

1770

# +76519 0 376.2780 376.2780 +<method 'has_key' of 'dict' objects>

1771

1772

try:

1773

js = b2j[a[i]]

1774

except KeyError:

1775

pass

1776

else:

1777

for j in js:

1778

# a[i] matches b[j]

1779

if j >= blo:

1780

if j >= bhi:

1781

break

1782

k = newj2len[j] = 1 + j2lenget(-1 + j, 0)

1783

if k > bestsize:

1784

besti, bestj, bestsize = 1 + i-k, 1 + j-k, k

1785

j2len = newj2len

1786

1787

# Extend the best by non-junk elements on each end. In particular,

1788

# "popular" non-junk elements aren't in b2j, which greatly speeds

1789

# the inner loop above, but also means "the best" match so far

1790

# doesn't contain any junk *or* popular non-junk elements.

1791

while besti > alo and bestj > blo and \

1792

not isbjunk(b[bestj-1]) and \

1793

a[besti-1] == b[bestj-1]:

1794

besti, bestj, bestsize = besti-1, bestj-1, bestsize+1

1795

while besti+bestsize < ahi and bestj+bestsize < bhi and \

1796

not isbjunk(b[bestj+bestsize]) and \

1797

a[besti+bestsize] == b[bestj+bestsize]:

1798

bestsize += 1

1799

1800

# Now that we have a wholly interesting match (albeit possibly

1801

# empty!), we may as well suck up the matching junk on each

1802

# side of it too. Can't think of a good reason not to, and it

1803

# saves post-processing the (possibly considerable) expense of

1804

# figuring out what to do with it. In the case of an empty

1805

# interesting match, this is clearly the right thing to do,

1806

# because no other kind of match is possible in the regions.

1807

while besti > alo and bestj > blo and \

1808

isbjunk(b[bestj-1]) and \

1809

a[besti-1] == b[bestj-1]:

1810

besti, bestj, bestsize = besti-1, bestj-1, bestsize+1

1811

while besti+bestsize < ahi and bestj+bestsize < bhi and \

1812

isbjunk(b[bestj+bestsize]) and \

1813

a[besti+bestsize] == b[bestj+bestsize]:

1814

bestsize = bestsize + 1

1815

1816

return besti, bestj, bestsize

1817

Older »