# Copyright (C) 2005, 2006 by Canonical Ltd
# Written by Martin Pool.
# Modified by Johan Rydberg <jrydberg@gnu.org>
# Modified by Robert Collins <robert.collins@canonical.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""Knit versionedfile implementation.
22
A knit is a versioned file implementation that supports efficient append only
26
lifeless: the data file is made up of "delta records". each delta record has a delta header
27
that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of
28
the -expanded data- (ie, the delta applied to the parent). the delta also ends with a
29
end-marker; simply "end VERSION"
31
delta can be line or full contents.a
32
... the 8's there are the index number of the annotation.
33
version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e
37
8 e.set('executable', 'yes')
39
8 if elt.get('executable') == 'yes':
40
8 ie.executable = True
41
end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad
45
09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents
46
09:33 < jrydberg> lifeless: the parents are currently dictionary compressed
47
09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)
48
09:33 < lifeless> right
49
09:33 < jrydberg> lifeless: the position and size is the range in the data file
52
so the index sequence is the dictionary compressed sequence number used
53
in the deltas to provide line annotation
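
as an illustration (a made-up entry, assuming the layout written out by
_KnitIndex.add_version below: version id, options, position, size, parents):

  rev-2 line-delta 125 80 0

i.e. rev-2 is stored as a line-delta starting at byte 125 of the data file,
80 bytes long, and its single parent is the version at index position 0.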
# 10:16 < lifeless> make partial index writes safe
# 10:16 < lifeless> implement 'knit.check()' like weave.check()
# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave

# move sha1 out of the content so that join is faster at verifying parents
# record content length ?
# stdlib modules needed by the code below (GzipFile/SequenceMatcher tuning,
# record copying and the transport-based data file access)
from copy import copy
from cStringIO import StringIO
import difflib
import errno
import gzip
from itertools import izip, chain
import zlib

import bzrlib
import bzrlib.errors as errors
from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
        InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
        RevisionNotPresent, RevisionAlreadyPresent
from bzrlib.trace import mutter
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
        sha_strings
import bzrlib.ui
from bzrlib.versionedfile import VersionedFile, InterVersionedFile
from bzrlib.tsort import topo_sort
# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data
# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor
# position after writing to work out where it was located. we may need to
# bypass python file buffering.


INDEX_SUFFIX = '.kndx'
DATA_SUFFIX = '.knit'
103
class KnitContent(object):
104
"""Content of a knit version to which deltas can be applied."""
106
def __init__(self, lines):
109
def annotate_iter(self):
110
"""Yield tuples of (origin, text) for each content line."""
111
for origin, text in self._lines:
115
"""Return a list of (origin, text) tuples."""
116
return list(self.annotate_iter())
118
def line_delta_iter(self, new_lines):
119
"""Generate line-based delta from this content to new_lines."""
120
new_texts = [text for origin, text in new_lines._lines]
121
old_texts = [text for origin, text in self._lines]
122
s = SequenceMatcher(None, old_texts, new_texts)
123
for op in s.get_opcodes():
126
# yields (old_start, old_end, new_line_count, new_lines)
127
yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])
129
def line_delta(self, new_lines):
130
return list(self.line_delta_iter(new_lines))
133
return [text for origin, text in self._lines]
136
class _KnitFactory(object):
137
"""Base factory for creating content objects."""
139
def make(self, lines, version):
140
num_lines = len(lines)
141
return KnitContent(zip([version] * num_lines, lines))
144
class KnitAnnotateFactory(_KnitFactory):
145
"""Factory for creating annotated Content objects."""
149
def parse_fulltext(self, content, version):
150
"""Convert fulltext to internal representation
152
fulltext content is of the format
153
revid(utf8) plaintext\n
154
internal representation is of the format:
159
origin, text = line.split(' ', 1)
160
lines.append((origin.decode('utf-8'), text))
161
return KnitContent(lines)
163
def parse_line_delta_iter(self, lines):
164
"""Convert a line based delta into internal representation.
166
line delta is in the form of:
167
intstart intend intcount
169
revid(utf8) newline\n
170
internal representation is
171
(start, end, count, [1..count tuples (revid, newline)])
174
header = lines.pop(0)
175
start, end, c = [int(n) for n in header.split(',')]
178
origin, text = lines.pop(0).split(' ', 1)
179
contents.append((origin.decode('utf-8'), text))
180
yield start, end, c, contents
182
def parse_line_delta(self, lines, version):
183
return list(self.parse_line_delta_iter(lines))
185
def lower_fulltext(self, content):
186
"""convert a fulltext content record into a serializable form.
188
see parse_fulltext which this inverts.
190
return ['%s %s' % (o.encode('utf-8'), t) for o, t in content._lines]
192
def lower_line_delta(self, delta):
193
"""convert a delta into a serializable form.
195
See parse_line_delta_iter which this inverts.
198
for start, end, c, lines in delta:
199
out.append('%d,%d,%d\n' % (start, end, c))
200
for origin, text in lines:
201
out.append('%s %s' % (origin.encode('utf-8'), text))
205
class KnitPlainFactory(_KnitFactory):
206
"""Factory for creating plain Content objects."""
210
def parse_fulltext(self, content, version):
211
"""This parses an unannotated fulltext.
213
Note that this is not a noop - the internal representation
214
has (versionid, line) - it's just a constant versionid.
216
return self.make(content, version)
218
def parse_line_delta_iter(self, lines, version):
220
header = lines.pop(0)
221
start, end, c = [int(n) for n in header.split(',')]
222
yield start, end, c, zip([version] * c, lines[:c])
225
def parse_line_delta(self, lines, version):
226
return list(self.parse_line_delta_iter(lines, version))
228
def lower_fulltext(self, content):
229
return content.text()
231
def lower_line_delta(self, delta):
233
for start, end, c, lines in delta:
234
out.append('%d,%d,%d\n' % (start, end, c))
235
out.extend([text for origin, text in lines])
239
def make_empty_knit(transport, relpath):
240
"""Construct a empty knit at the specified location."""
241
k = KnitVersionedFile(transport, relpath, 'w', KnitPlainFactory)
245
class KnitVersionedFile(VersionedFile):
246
"""Weave-like structure with faster random access.
248
A knit stores a number of texts and a summary of the relationships
249
between them. Texts are identified by a string version-id. Texts
250
are normally stored and retrieved as a series of lines, but can
251
also be passed as single strings.
253
Lines are stored with the trailing newline (if any) included, to
254
avoid special cases for files with no final newline. Lines are
255
composed of 8-bit characters, not unicode. The combination of
256
these approaches should mean any 'binary' file can be safely
257
stored and retrieved.
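
A brief illustrative sketch of typical use (not taken from the real test
suite; it assumes a bzrlib transport object named 'transport' and uses the
public VersionedFile API methods add_lines() and get_lines() defined below):

    knit = KnitVersionedFile('inventory', transport, create=True)
    knit.add_lines('rev-1', [], ['first line\n'])
    knit.add_lines('rev-2', ['rev-1'], ['first line\n', 'second line\n'])
    assert knit.get_lines('rev-2')[1] == 'second line\n'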
260
def __init__(self, relpath, transport, file_mode=None, access_mode=None, factory=None,
261
basis_knit=None, delta=True, create=False):
262
"""Construct a knit at location specified by relpath.
264
:param create: If not True, only open an existing knit.
266
if access_mode is None:
268
super(KnitVersionedFile, self).__init__(access_mode)
269
assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
270
assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
273
self.transport = transport
274
self.filename = relpath
275
self.basis_knit = basis_knit
276
self.factory = factory or KnitAnnotateFactory()
277
self.writable = (access_mode == 'w')
280
self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
281
access_mode, create=create)
282
self._data = _KnitData(transport, relpath + DATA_SUFFIX,
283
access_mode, create=not len(self.versions()))
285
def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
286
"""See VersionedFile._add_delta()."""
287
self._check_add(version_id, []) # should we check the lines ?
288
self._check_versions_present(parents)
292
for parent in parents:
293
if not self.has_version(parent):
294
ghosts.append(parent)
296
present_parents.append(parent)
298
if delta_parent is None:
299
# reconstitute as full text.
300
assert len(delta) == 1 or len(delta) == 0
302
assert delta[0][0] == 0
303
assert delta[0][1] == 0, delta[0][1]
304
return super(KnitVersionedFile, self)._add_delta(version_id,
315
options.append('no-eol')
317
if delta_parent is not None:
318
# determine the current delta chain length.
319
# To speed the extract of texts the delta chain is limited
320
# to a fixed number of deltas. This should minimize both
321
# I/O and the time spent applying deltas.
323
delta_parents = [delta_parent]
325
parent = delta_parents[0]
326
method = self._index.get_method(parent)
327
if method == 'fulltext':
329
delta_parents = self._index.get_parents(parent)
331
if method == 'line-delta':
332
# did not find a fulltext in the delta limit.
333
# just do a normal insertion.
334
return super(KnitVersionedFile, self)._add_delta(version_id,
341
options.append('line-delta')
342
store_lines = self.factory.lower_line_delta(delta)
344
where, size = self._data.add_record(version_id, digest, store_lines)
345
self._index.add_version(version_id, options, where, size, parents)
347
def clear_cache(self):
348
"""Clear the data cache only."""
349
self._data.clear_cache()
351
def copy_to(self, name, transport):
352
"""See VersionedFile.copy_to()."""
353
# copy the current index to a temp index to avoid racing with local
355
transport.put(name + INDEX_SUFFIX + '.tmp', self.transport.get(self._index._filename))
357
transport.put(name + DATA_SUFFIX, self._data._open_file())
358
# rename the copied index into place
359
transport.rename(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)
361
def create_empty(self, name, transport, mode=None):
362
return KnitVersionedFile(name, transport, factory=self.factory, delta=self.delta, create=True)
364
def _fix_parents(self, version, new_parents):
365
"""Fix the parents list for version.
367
This is done by appending a new version to the index
368
with identical data except for the parents list.
369
The parents list must be a superset of the current
372
current_values = self._index._cache[version]
373
assert set(current_values[4]).difference(set(new_parents)) == set()
374
self._index.add_version(version,
380
def get_delta(self, version_id):
381
"""Get a delta for constructing version from some other version."""
382
if not self.has_version(version_id):
383
raise RevisionNotPresent(version_id, self.filename)
385
parents = self.get_parents(version_id)
390
data_pos, data_size = self._index.get_position(version_id)
391
data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
392
version_idx = self._index.lookup(version_id)
393
noeol = 'no-eol' in self._index.get_options(version_id)
394
if 'fulltext' == self._index.get_method(version_id):
395
new_content = self.factory.parse_fulltext(data, version_idx)
396
if parent is not None:
397
reference_content = self._get_content(parent)
398
old_texts = reference_content.text()
401
new_texts = new_content.text()
402
delta_seq = SequenceMatcher(None, old_texts, new_texts)
403
return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
405
delta = self.factory.parse_line_delta(data, version_idx)
406
return parent, sha1, noeol, delta
408
def get_graph_with_ghosts(self):
409
"""See VersionedFile.get_graph_with_ghosts()."""
410
graph_items = self._index.get_graph()
411
return dict(graph_items)
415
"""See VersionedFile.get_suffixes()."""
416
return [DATA_SUFFIX, INDEX_SUFFIX]
418
def has_ghost(self, version_id):
419
"""True if there is a ghost reference in the file to version_id."""
421
if self.has_version(version_id):
423
# optimisable if needed by memoising the _ghosts set.
424
items = self._index.get_graph()
425
for node, parents in items:
426
for parent in parents:
427
if parent not in self._index._cache:
428
if parent == version_id:
433
"""See VersionedFile.versions."""
434
return self._index.get_versions()
436
def has_version(self, version_id):
437
"""See VersionedFile.has_version."""
438
return self._index.has_version(version_id)
440
__contains__ = has_version
442
def _merge_annotations(self, content, parents, parent_texts={},
443
delta=None, annotated=None):
444
"""Merge annotations for content. This is done by comparing
445
the annotations based on changes to the text.
449
for parent_id in parents:
450
merge_content = self._get_content(parent_id, parent_texts)
451
seq = SequenceMatcher(None, merge_content.text(), content.text())
452
if delta_seq is None:
453
# setup a delta seq to reuse.
455
for i, j, n in seq.get_matching_blocks():
458
# this appears to copy (origin, text) pairs across to the new
459
# content for any line that matches the last-checked parent.
460
# FIXME: save the sequence control data for delta compression
461
# against the most relevant parent rather than rediffing.
462
content._lines[j:j+n] = merge_content._lines[i:i+n]
465
reference_content = self._get_content(parents[0], parent_texts)
466
new_texts = content.text()
467
old_texts = reference_content.text()
468
delta_seq = SequenceMatcher(None, old_texts, new_texts)
469
return self._make_line_delta(delta_seq, content)
471
def _make_line_delta(self, delta_seq, new_content):
472
"""Generate a line delta from delta_seq and new_content."""
474
for op in delta_seq.get_opcodes():
477
diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
480
def _get_components(self, version_id):
481
"""Return a list of (version_id, method, data) tuples that
482
make up the version specified by version_id of the knit.
484
The components should be applied in the order of the returned
487
The basis knit will be used to the largest extent possible
488
since it is assumed that accesses to it are faster.
490
# needed_revisions holds a list of (method, version_id) of
491
# versions that need to be fetched to construct the final
492
# version of the file.
494
# basis_revisions is a list of versions that need to be
# fetched but exist in the basis knit.
497
basis = self.basis_knit
504
if basis and basis._index.has_version(cursor):
506
basis_versions.append(cursor)
507
method = picked_knit._index.get_method(cursor)
508
needed_versions.append((method, cursor))
509
if method == 'fulltext':
511
cursor = picked_knit.get_parents(cursor)[0]
516
for comp_id in basis_versions:
517
data_pos, data_size = basis._index.get_data_position(comp_id)
518
records.append((comp_id, data_pos, data_size))
519
components.update(basis._data.read_records(records))
522
for comp_id in [vid for method, vid in needed_versions
523
if vid not in basis_versions]:
524
data_pos, data_size = self._index.get_position(comp_id)
525
records.append((comp_id, data_pos, data_size))
526
components.update(self._data.read_records(records))
528
# get_data_records returns a mapping with the version id as
529
# index and the value as data. The order the components need
530
# to be applied is held by needed_versions (reversed).
532
for method, comp_id in reversed(needed_versions):
533
out.append((comp_id, method, components[comp_id]))
537
def _get_content(self, version_id, parent_texts={}):
538
"""Returns a content object that makes up the specified
540
if not self.has_version(version_id):
541
raise RevisionNotPresent(version_id, self.filename)
543
cached_version = parent_texts.get(version_id, None)
544
if cached_version is not None:
545
return cached_version
547
if self.basis_knit and version_id in self.basis_knit:
548
return self.basis_knit._get_content(version_id)
551
components = self._get_components(version_id)
552
for component_id, method, (data, digest) in components:
553
version_idx = self._index.lookup(component_id)
554
if method == 'fulltext':
555
assert content is None
556
content = self.factory.parse_fulltext(data, version_idx)
557
elif method == 'line-delta':
558
delta = self.factory.parse_line_delta(data, version_idx)
559
content._lines = self._apply_delta(content._lines, delta)
561
if 'no-eol' in self._index.get_options(version_id):
562
line = content._lines[-1][1].rstrip('\n')
563
content._lines[-1] = (content._lines[-1][0], line)
565
if sha_strings(content.text()) != digest:
567
raise KnitCorrupt(self.filename, 'sha-1 does not match %s' % version_id)
571
def _check_versions_present(self, version_ids):
572
"""Check that all specified versions are present."""
573
version_ids = set(version_ids)
574
for r in list(version_ids):
575
if self._index.has_version(r):
576
version_ids.remove(r)
578
raise RevisionNotPresent(list(version_ids)[0], self.filename)
580
def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
581
"""See VersionedFile.add_lines_with_ghosts()."""
582
self._check_add(version_id, lines)
583
return self._add(version_id, lines[:], parents, self.delta, parent_texts)
585
def _add_lines(self, version_id, parents, lines, parent_texts):
586
"""See VersionedFile.add_lines."""
587
self._check_add(version_id, lines)
588
self._check_versions_present(parents)
589
return self._add(version_id, lines[:], parents, self.delta, parent_texts)
591
def _check_add(self, version_id, lines):
592
"""check that version_id and lines are safe to add."""
593
assert self.writable, "knit is not opened for write"
594
### FIXME escape. RBC 20060228
595
if contains_whitespace(version_id):
596
raise InvalidRevisionId(version_id)
597
if self.has_version(version_id):
598
raise RevisionAlreadyPresent(version_id, self.filename)
600
if False or __debug__:
602
assert '\n' not in l[:-1]
604
def _add(self, version_id, lines, parents, delta, parent_texts):
605
"""Add a set of lines on top of version specified by parents.
607
If delta is true, compress the text as a line-delta against
610
Any versions not present will be converted into ghosts.
612
# 461 0 6546.0390 43.9100 bzrlib.knit:489(_add)
613
# +400 0 889.4890 418.9790 +bzrlib.knit:192(lower_fulltext)
614
# +461 0 1364.8070 108.8030 +bzrlib.knit:996(add_record)
615
# +461 0 193.3940 41.5720 +bzrlib.knit:898(add_version)
616
# +461 0 134.0590 18.3810 +bzrlib.osutils:361(sha_strings)
617
# +461 0 36.3420 15.4540 +bzrlib.knit:146(make)
618
# +1383 0 8.0370 8.0370 +<len>
619
# +61 0 13.5770 7.9190 +bzrlib.knit:199(lower_line_delta)
620
# +61 0 963.3470 7.8740 +bzrlib.knit:427(_get_content)
621
# +61 0 973.9950 5.2950 +bzrlib.knit:136(line_delta)
622
# +61 0 1918.1800 5.2640 +bzrlib.knit:359(_merge_annotations)
626
if parent_texts is None:
628
for parent in parents:
629
if not self.has_version(parent):
630
ghosts.append(parent)
632
present_parents.append(parent)
634
if delta and not len(present_parents):
637
digest = sha_strings(lines)
640
if lines[-1][-1] != '\n':
641
options.append('no-eol')
642
lines[-1] = lines[-1] + '\n'
644
if len(present_parents) and delta:
645
# To speed the extract of texts the delta chain is limited
646
# to a fixed number of deltas. This should minimize both
647
# I/O and the time spent applying deltas.
649
delta_parents = present_parents
651
parent = delta_parents[0]
652
method = self._index.get_method(parent)
653
if method == 'fulltext':
655
delta_parents = self._index.get_parents(parent)
657
if method == 'line-delta':
660
lines = self.factory.make(lines, version_id)
661
if delta or (self.factory.annotated and len(present_parents) > 0):
662
# Merge annotations from parent texts if needed.
663
delta_hunks = self._merge_annotations(lines, present_parents, parent_texts,
664
delta, self.factory.annotated)
667
options.append('line-delta')
668
store_lines = self.factory.lower_line_delta(delta_hunks)
670
options.append('fulltext')
671
store_lines = self.factory.lower_fulltext(lines)
673
where, size = self._data.add_record(version_id, digest, store_lines)
674
self._index.add_version(version_id, options, where, size, parents)
677
def check(self, progress_bar=None):
678
"""See VersionedFile.check()."""
680
def _clone_text(self, new_version_id, old_version_id, parents):
681
"""See VersionedFile.clone_text()."""
682
# FIXME RBC 20060228 make fast by only inserting an index with null delta.
683
self.add_lines(new_version_id, parents, self.get_lines(old_version_id))
685
def get_lines(self, version_id):
686
"""See VersionedFile.get_lines()."""
687
return self._get_content(version_id).text()
689
def iter_lines_added_or_present_in_versions(self, version_ids=None):
690
"""See VersionedFile.iter_lines_added_or_present_in_versions()."""
691
if version_ids is None:
692
version_ids = self.versions()
693
# we don't care about inclusions, the caller cares.
# but we need to set up a list of records to visit.
695
# we need version_id, position, length
696
version_id_records = []
697
requested_versions = list(version_ids)
698
# filter for available versions
699
for version_id in requested_versions:
700
if not self.has_version(version_id):
701
raise RevisionNotPresent(version_id, self.filename)
702
# get an in-component-order queue:
704
for version_id in self.versions():
705
if version_id in requested_versions:
706
version_ids.append(version_id)
707
data_pos, length = self._index.get_position(version_id)
708
version_id_records.append((version_id, data_pos, length))
710
pb = bzrlib.ui.ui_factory.nested_progress_bar()
712
total = len(version_id_records)
714
pb.update('Walking content.', count, total)
715
for version_id, data, sha_value in \
716
self._data.read_records_iter(version_id_records):
717
pb.update('Walking content.', count, total)
718
method = self._index.get_method(version_id)
719
version_idx = self._index.lookup(version_id)
720
assert method in ('fulltext', 'line-delta')
721
if method == 'fulltext':
722
content = self.factory.parse_fulltext(data, version_idx)
723
for line in content.text():
726
delta = self.factory.parse_line_delta(data, version_idx)
727
for start, end, count, lines in delta:
728
for origin, line in lines:
731
pb.update('Walking content.', total, total)
734
pb.update('Walking content.', total, total)
738
def num_versions(self):
739
"""See VersionedFile.num_versions()."""
740
return self._index.num_versions()
742
__len__ = num_versions
744
def annotate_iter(self, version_id):
745
"""See VersionedFile.annotate_iter."""
746
content = self._get_content(version_id)
747
for origin, text in content.annotate_iter():
750
def get_parents(self, version_id):
751
"""See VersionedFile.get_parents."""
752
self._check_versions_present([version_id])
753
return list(self._index.get_parents(version_id))
755
def get_parents_with_ghosts(self, version_id):
756
"""See VersionedFile.get_parents."""
757
self._check_versions_present([version_id])
758
return list(self._index.get_parents_with_ghosts(version_id))
760
def get_ancestry(self, versions):
761
"""See VersionedFile.get_ancestry."""
762
if isinstance(versions, basestring):
763
versions = [versions]
766
self._check_versions_present(versions)
767
return self._index.get_ancestry(versions)
769
def get_ancestry_with_ghosts(self, versions):
770
"""See VersionedFile.get_ancestry_with_ghosts."""
771
if isinstance(versions, basestring):
772
versions = [versions]
775
self._check_versions_present(versions)
776
return self._index.get_ancestry_with_ghosts(versions)
778
#@deprecated_method(zero_eight)
779
def walk(self, version_ids):
780
"""See VersionedFile.walk."""
781
# We take the short path here, and extract all relevant texts
782
# and put them in a weave and let that do all the work. Far
783
# from optimal, but is much simpler.
784
# FIXME RB 20060228 this really is inefficient!
785
from bzrlib.weave import Weave
787
w = Weave(self.filename)
788
ancestry = self.get_ancestry(version_ids)
789
sorted_graph = topo_sort(self._index.get_graph())
790
version_list = [vid for vid in sorted_graph if vid in ancestry]
792
for version_id in version_list:
793
lines = self.get_lines(version_id)
794
w.add_lines(version_id, self.get_parents(version_id), lines)
796
for lineno, insert_id, dset, line in w.walk(version_ids):
797
yield lineno, insert_id, dset, line
800
class _KnitComponentFile(object):
801
"""One of the files used to implement a knit database"""
803
def __init__(self, transport, filename, mode):
804
self._transport = transport
805
self._filename = filename
808
def write_header(self):
809
if self._transport.append(self._filename, StringIO(self.HEADER)):
810
raise KnitCorrupt(self._filename, 'misaligned after writing header')
812
def check_header(self, fp):
813
line = fp.read(len(self.HEADER))
814
if line != self.HEADER:
815
raise KnitHeaderError(badline=line)
818
"""Commit is a nop."""
821
return '%s(%s)' % (self.__class__.__name__, self._filename)
824
class _KnitIndex(_KnitComponentFile):
825
"""Manages knit index file.
827
The index is already kept in memory and read on startup, to enable
828
fast lookups of revision information. The cursor of the index
829
file is always pointing to the end, making it easy to append
832
_cache is a cache for fast mapping from version id to a Index
835
_history is a cache for fast mapping from indexes to version ids.
837
The index data format is dictionary compressed when it comes to
parent references; an index entry may only have parents with a
lower index number. As a result, the index is topologically sorted.

Duplicate entries may be written to the index for a single version id;
if this is done then the latter one completely replaces the former:
this allows updates to correct version and parent information.
Note that the two entries may share the delta, and that successive
annotations and references MUST point to the first entry.
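
For illustration (made-up entries, not from a real index), two consecutive
index records might read:

  rev-1 fulltext 0 120
  rev-2 line-delta 120 45 0 .ghost-rev

rev-2's parents are dictionary compressed: '0' refers to the entry at index
position 0 (rev-1), while '.ghost-rev' records a ghost parent by name
(see _parse_parents and _version_list_to_index below).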
848
HEADER = "# bzr knit index 7\n"
850
# speed of knit parsing went from 280 ms to 280 ms with slots addition.
851
# __slots__ = ['_cache', '_history', '_transport', '_filename']
853
def _cache_version(self, version_id, options, pos, size, parents):
854
"""Cache a version record in the history array and index cache.
856
This is inlined into __init__ for performance. KEEP IN SYNC.
857
(It saves 60ms, 25% of the __init__ overhead on local 4000 record
860
# only want the _history index to reference the 1st index entry
862
if version_id not in self._cache:
863
self._history.append(version_id)
864
self._cache[version_id] = (version_id, options, pos, size, parents)
866
def __init__(self, transport, filename, mode, create=False):
867
_KnitComponentFile.__init__(self, transport, filename, mode)
869
# position in _history is the 'official' index for a revision
870
# but the values may have come from a newer entry.
871
# so - wc -l of a knit index is != the number of unique names
874
pb = bzrlib.ui.ui_factory.nested_progress_bar()
879
pb.update('read knit index', count, total)
880
fp = self._transport.get(self._filename)
881
self.check_header(fp)
882
# readlines reads the whole file at once:
883
# bad for transports like http, good for local disk
884
# we save 60 ms doing this one change (
885
# from calling readline each time to calling
887
# probably what we want for nice behaviour on
888
# http is an incremental readlines that yields, or
889
# a check for local vs non local indexes,
890
for l in fp.readlines():
894
#pb.update('read knit index', count, total)
895
# See self._parse_parents
897
for value in rec[4:]:
899
# uncompressed reference
900
parents.append(value[1:])
902
# this is 15/4000ms faster than isinstance,
904
# this function is called thousands of times a
905
# second so small variations add up.
906
assert value.__class__ is str
907
parents.append(self._history[int(value)])
908
# end self._parse_parents
909
# self._cache_version(rec[0],
914
# --- self._cache_version
915
# only want the _history index to reference the 1st
916
# index entry for version_id
918
if version_id not in self._cache:
919
self._history.append(version_id)
920
self._cache[version_id] = (version_id,
925
# --- self._cache_version
926
except NoSuchFile, e:
927
if mode != 'w' or not create:
931
pb.update('read knit index', total, total)
934
def _parse_parents(self, compressed_parents):
935
"""convert a list of string parent values into version ids.
937
ints are looked up in the index.
938
.FOO values are ghosts and converted into FOO.
940
NOTE: the function is retained here for clarity, and for possible
941
use in partial index reads. However bulk processing now has
942
it inlined in __init__ for inner-loop optimisation.
945
for value in compressed_parents:
947
# uncompressed reference
948
result.append(value[1:])
950
# this is 15/4000ms faster than isinstance,
951
# this function is called thousands of times a
952
# second so small variations add up.
953
assert value.__class__ is str
954
result.append(self._history[int(value)])
959
for version_id, index in self._cache.iteritems():
960
graph.append((version_id, index[4]))
963
def get_ancestry(self, versions):
964
"""See VersionedFile.get_ancestry."""
965
# get a graph of all the mentioned versions:
967
pending = set(versions)
969
version = pending.pop()
970
parents = self._cache[version][4]
973
parents = [parent for parent in parents if parent in self._cache]
974
for parent in parents:
975
# if not completed and not a ghost
976
if parent not in graph:
978
graph[version] = parents
979
return topo_sort(graph.items())
981
def get_ancestry_with_ghosts(self, versions):
982
"""See VersionedFile.get_ancestry_with_ghosts."""
983
# get a graph of all the mentioned versions:
985
pending = set(versions)
987
version = pending.pop()
989
parents = self._cache[version][4]
996
for parent in parents:
997
if parent not in graph:
999
graph[version] = parents
1000
return topo_sort(graph.items())
1002
def num_versions(self):
1003
return len(self._history)
1005
__len__ = num_versions
1007
def get_versions(self):
1008
return self._history
1010
def idx_to_name(self, idx):
1011
return self._history[idx]
1013
def lookup(self, version_id):
1014
assert version_id in self._cache
1015
return self._history.index(version_id)
1017
def _version_list_to_index(self, versions):
1019
for version in versions:
1020
if version in self._cache:
1021
result_list.append(str(self._history.index(version)))
1023
result_list.append('.' + version.encode('utf-8'))
1024
return ' '.join(result_list)
1026
def add_version(self, version_id, options, pos, size, parents):
1027
"""Add a version record to the index."""
1028
self._cache_version(version_id, options, pos, size, parents)
1030
content = "%s %s %s %s %s\n" % (version_id.encode('utf-8'),
1034
self._version_list_to_index(parents))
1035
assert isinstance(content, str), 'content must be utf-8 encoded'
1036
self._transport.append(self._filename, StringIO(content))
1038
def has_version(self, version_id):
1039
"""True if the version is in the index."""
1040
return version_id in self._cache
1042
def get_position(self, version_id):
1043
"""Return data position and size of specified version."""
1044
return (self._cache[version_id][2], \
1045
self._cache[version_id][3])
1047
def get_method(self, version_id):
1048
"""Return compression method of specified version."""
1049
options = self._cache[version_id][1]
1050
if 'fulltext' in options:
1053
assert 'line-delta' in options
1056
def get_options(self, version_id):
1057
return self._cache[version_id][1]
1059
def get_parents(self, version_id):
1060
"""Return parents of specified version ignoring ghosts."""
1061
return [parent for parent in self._cache[version_id][4]
1062
if parent in self._cache]
1064
def get_parents_with_ghosts(self, version_id):
1065
"""Return parents of specified version wth ghosts."""
1066
return self._cache[version_id][4]
1068
def check_versions_present(self, version_ids):
1069
"""Check that all specified versions are present."""
1070
version_ids = set(version_ids)
1071
for version_id in list(version_ids):
1072
if version_id in self._cache:
1073
version_ids.remove(version_id)
1075
raise RevisionNotPresent(list(version_ids)[0], self.filename)
1078
class _KnitData(_KnitComponentFile):
1079
"""Contents of the knit data file"""
1081
HEADER = "# bzr knit data 7\n"
1083
def __init__(self, transport, filename, mode, create=False):
1084
_KnitComponentFile.__init__(self, transport, filename, mode)
1086
self._checked = False
1088
self._transport.put(self._filename, StringIO(''))
1091
def clear_cache(self):
1092
"""Clear the record cache."""
1095
def _open_file(self):
1096
if self._file is None:
1098
self._file = self._transport.get(self._filename)
1103
def _record_to_data(self, version_id, digest, lines):
1104
"""Convert version_id, digest, lines into a raw data block.
1106
:return: (len, a StringIO instance with the raw data ready to read.)
1109
data_file = GzipFile(None, mode='wb', fileobj=sio)
1110
data_file.writelines(chain(
1111
["version %s %d %s\n" % (version_id.encode('utf-8'),
1115
["end %s\n\n" % version_id.encode('utf-8')]))
1122
def add_raw_record(self, raw_data):
1123
"""Append a prepared record to the data file."""
1124
assert isinstance(raw_data, str), 'data must be plain bytes'
1125
start_pos = self._transport.append(self._filename, StringIO(raw_data))
1126
return start_pos, len(raw_data)
1128
def add_record(self, version_id, digest, lines):
1129
"""Write new text record to disk. Returns the position in the
1130
file where it was written."""
1131
size, sio = self._record_to_data(version_id, digest, lines)
1133
self._records[version_id] = (digest, lines)
1135
start_pos = self._transport.append(self._filename, sio)
1136
return start_pos, size
1138
def _parse_record_header(self, version_id, raw_data):
1139
"""Parse a record header for consistency.
1141
:return: the header and the decompressor stream.
1142
as (stream, header_record)
1144
df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
1145
rec = df.readline().split()
1147
raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')
1148
if rec[1].decode('utf-8') != version_id:
1149
raise KnitCorrupt(self._filename,
1150
'unexpected version, wanted %r, got %r' % (
1151
version_id, rec[1]))
1154
def _parse_record(self, version_id, data):
1155
df, rec = self._parse_record_header(version_id, data)
1157
record_contents = self._read_record_contents(df, lines)
1159
if l.decode('utf-8') != 'end %s\n' % version_id:
1160
raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
1163
return record_contents, rec[3]
1165
def _read_record_contents(self, df, record_lines):
1166
"""Read and return n lines from datafile."""
1168
for i in range(record_lines):
1169
r.append(df.readline())
1172
def read_records_iter_raw(self, records):
1173
"""Read text records from data file and yield raw data.
1175
This unpacks enough of the text record to validate the id is
1176
as expected, but that's all.
1178
It will actively recompress currently cached records on the
1179
basis that that is cheaper than I/O activity.
1182
for version_id, pos, size in records:
1183
if version_id not in self._records:
1184
needed_records.append((version_id, pos, size))
1186
# setup an iterator of the external records:
1187
# uses readv so nice and fast we hope.
1188
if len(needed_records):
1189
# grab the disk data needed.
1190
raw_records = self._transport.readv(self._filename,
1191
[(pos, size) for version_id, pos, size in needed_records])
1193
for version_id, pos, size in records:
1194
if version_id in self._records:
1195
# compress a new version
1196
size, sio = self._record_to_data(version_id,
1197
self._records[version_id][0],
1198
self._records[version_id][1])
1199
yield version_id, sio.getvalue()
1201
pos, data = raw_records.next()
1202
# validate the header
1203
df, rec = self._parse_record_header(version_id, data)
1205
yield version_id, data
1208
def read_records_iter(self, records):
1209
"""Read text records from data file and yield result.
1211
Each passed record is a tuple of (version_id, pos, len) and
1212
will be read in the given order. Yields (version_id,
1217
for version_id, pos, size in records:
1218
if version_id not in self._records:
1219
needed_records.append((version_id, pos, size))
1221
if len(needed_records):
1222
# We take it that the transport optimizes the fetching as well
# as possible (ie, reads continuous ranges.)
1224
response = self._transport.readv(self._filename,
1225
[(pos, size) for version_id, pos, size in needed_records])
1227
for (record_id, pos, size), (pos, data) in izip(iter(needed_records), response):
1228
content, digest = self._parse_record(record_id, data)
1229
self._records[record_id] = (digest, content)
1231
for version_id, pos, size in records:
1232
yield version_id, copy(self._records[version_id][1]), copy(self._records[version_id][0])
1234
def read_records(self, records):
1235
"""Read records into a dictionary."""
1237
for record_id, content, digest in self.read_records_iter(records):
1238
components[record_id] = (content, digest)
1242
class InterKnit(InterVersionedFile):
1243
"""Optimised code paths for knit to knit operations."""
1245
_matching_file_factory = KnitVersionedFile
1248
def is_compatible(source, target):
1249
"""Be compatible with knits. """
1251
return (isinstance(source, KnitVersionedFile) and
1252
isinstance(target, KnitVersionedFile))
1253
except AttributeError:
1256
def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
1257
"""See InterVersionedFile.join."""
1258
assert isinstance(self.source, KnitVersionedFile)
1259
assert isinstance(self.target, KnitVersionedFile)
1261
if version_ids is None:
1262
version_ids = self.source.versions()
1264
if not ignore_missing:
1265
self.source._check_versions_present(version_ids)
1267
version_ids = set(self.source.versions()).intersection(
1273
pb = bzrlib.ui.ui_factory.nested_progress_bar()
1275
version_ids = list(version_ids)
1276
if None in version_ids:
1277
version_ids.remove(None)
1279
self.source_ancestry = set(self.source.get_ancestry(version_ids))
1280
this_versions = set(self.target._index.get_versions())
1281
needed_versions = self.source_ancestry - this_versions
1282
cross_check_versions = self.source_ancestry.intersection(this_versions)
1283
mismatched_versions = set()
1284
for version in cross_check_versions:
1285
# scan to include needed parents.
1286
n1 = set(self.target.get_parents_with_ghosts(version))
1287
n2 = set(self.source.get_parents_with_ghosts(version))
1289
# FIXME TEST this check for cycles being introduced works
1290
# the logic is we have a cycle if in our graph we are an
1291
# ancestor of any of the n2 revisions.
1297
parent_ancestors = self.source.get_ancestry(parent)
1298
if version in parent_ancestors:
1299
raise errors.GraphCycleError([parent, version])
1300
# ensure this parent will be available later.
1301
new_parents = n2.difference(n1)
1302
needed_versions.update(new_parents.difference(this_versions))
1303
mismatched_versions.add(version)
1305
if not needed_versions and not cross_check_versions:
1307
full_list = topo_sort(self.source.get_graph())
1309
version_list = [i for i in full_list if (not self.target.has_version(i)
1310
and i in needed_versions)]
1314
copy_queue_records = []
1316
for version_id in version_list:
1317
options = self.source._index.get_options(version_id)
1318
parents = self.source._index.get_parents_with_ghosts(version_id)
1319
# check that it will be a consistent copy:
1320
for parent in parents:
1321
# if source has the parent, we must :
1322
# * already have it or
1323
# * have it scheduled already
1324
# otherwise we don't care
1325
assert (self.target.has_version(parent) or
1326
parent in copy_set or
1327
not self.source.has_version(parent))
1328
data_pos, data_size = self.source._index.get_position(version_id)
1329
copy_queue_records.append((version_id, data_pos, data_size))
1330
copy_queue.append((version_id, options, parents))
1331
copy_set.add(version_id)
1333
# data suck the join:
1335
total = len(version_list)
1336
# we want the raw gzip for bulk copying, but the record validated
1337
# just enough to be sure its the right one.
1338
# TODO: consider writev or write combining to reduce
1339
# death of a thousand cuts feeling.
1340
for (version_id, raw_data), \
1341
(version_id2, options, parents) in \
1342
izip(self.source._data.read_records_iter_raw(copy_queue_records),
1344
assert version_id == version_id2, 'logic error, inconsistent results'
1346
pb.update("Joining knit", count, total)
1347
pos, size = self.target._data.add_raw_record(raw_data)
1348
self.target._index.add_version(version_id, options, pos, size, parents)
1350
for version in mismatched_versions:
1351
# FIXME RBC 20060309 is this needed?
1352
n1 = set(self.target.get_parents_with_ghosts(version))
1353
n2 = set(self.source.get_parents_with_ghosts(version))
1354
# write a combined record to our history preserving the current
1355
# parents as first in the list
1356
new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
1357
self.target.fix_parents(version, new_parents)
1363
InterVersionedFile.register_optimiser(InterKnit)
1366
# make GzipFile faster:
1368
class GzipFile(gzip.GzipFile):
1369
"""Knit tuned version of GzipFile.
1371
This is based on the following lsprof stats:
1372
python 2.4 stock GzipFile write:
1373
58971 0 5644.3090 2721.4730 gzip:193(write)
1374
+58971 0 1159.5530 1159.5530 +<built-in method compress>
1375
+176913 0 987.0320 987.0320 +<len>
1376
+58971 0 423.1450 423.1450 +<zlib.crc32>
1377
+58971 0 353.1060 353.1060 +<method 'write' of 'cStringIO.
1379
tuned GzipFile write:
1380
58971 0 4477.2590 2103.1120 bzrlib.knit:1250(write)
1381
+58971 0 1297.7620 1297.7620 +<built-in method compress>
1382
+58971 0 406.2160 406.2160 +<zlib.crc32>
1383
+58971 0 341.9020 341.9020 +<method 'write' of 'cStringIO.
1385
+58971 0 328.2670 328.2670 +<len>
1388
Yes, it's only 1.6 seconds, but they add up.
1391
def write(self, data):
1392
if self.mode != gzip.WRITE:
1394
raise IOError(errno.EBADF, "write() on read-only GzipFile object")
1396
if self.fileobj is None:
1397
raise ValueError, "write() on closed GzipFile object"
1398
data_len = len(data)
1400
self.size = self.size + data_len
1401
self.crc = zlib.crc32(data, self.crc)
1402
self.fileobj.write( self.compress.compress(data) )
1403
self.offset += data_len
1405
def writelines(self, lines):
1406
# profiling indicated a significant overhead
1407
# calling write for each line.
1408
# this batch call is a lot faster :).
1409
# (4 seconds to 1 second for the sample upgrades I was testing).
1410
self.write(''.join(lines))
1413
class SequenceMatcher(difflib.SequenceMatcher):
1414
"""Knit tuned sequence matcher.
1416
This is based on profiling of difflib which indicated some improvements
1417
for our usage pattern.
1420
def find_longest_match(self, alo, ahi, blo, bhi):
1421
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
1423
If isjunk is not defined:
1425
Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
1426
alo <= i <= i+k <= ahi
1427
blo <= j <= j+k <= bhi
1428
and for all (i',j',k') meeting those conditions,
1431
and if i == i', j <= j'
1433
In other words, of all maximal matching blocks, return one that
1434
starts earliest in a, and of all those maximal matching blocks that
1435
start earliest in a, return the one that starts earliest in b.
1437
>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
1438
>>> s.find_longest_match(0, 5, 0, 9)
1441
If isjunk is defined, first the longest matching block is
1442
determined as above, but with the additional restriction that no
1443
junk element appears in the block. Then that block is extended as
1444
far as possible by matching (only) junk elements on both sides. So
1445
the resulting block never matches on junk except as identical junk
1446
happens to be adjacent to an "interesting" match.
1448
Here's the same example as before, but considering blanks to be
1449
junk. That prevents " abcd" from matching the " abcd" at the tail
1450
end of the second sequence directly. Instead only the "abcd" can
1451
match, and matches the leftmost "abcd" in the second sequence:
1453
>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
1454
>>> s.find_longest_match(0, 5, 0, 9)
1457
If no blocks match, return (alo, blo, 0).
1459
>>> s = SequenceMatcher(None, "ab", "c")
1460
>>> s.find_longest_match(0, 2, 0, 1)
1464
# CAUTION: stripping common prefix or suffix would be incorrect.
1468
# Longest matching block is "ab", but if common prefix is
1469
# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
1470
# strip, so ends up claiming that ab is changed to acab by
1471
# inserting "ca" in the middle. That's minimal but unintuitive:
1472
# "it's obvious" that someone inserted "ac" at the front.
1473
# Windiff ends up at the same place as diff, but by pairing up
1474
# the unique 'b's and then matching the first two 'a's.
1476
a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
1477
besti, bestj, bestsize = alo, blo, 0
1478
# find longest junk-free match
1479
# during an iteration of the loop, j2len[j] = length of longest
1480
# junk-free match ending with a[i-1] and b[j]
1484
for i in xrange(alo, ahi):
1485
# look at all instances of a[i] in b; note that because
1486
# b2j has no junk keys, the loop is skipped if a[i] is junk
1487
j2lenget = j2len.get
1490
# changing b2j.get(a[i], nothing) to a try:Keyerror pair produced the
1491
# following improvement
1492
# 704 0 4650.5320 2620.7410 bzrlib.knit:1336(find_longest_match)
1493
# +326674 0 1655.1210 1655.1210 +<method 'get' of 'dict' objects>
1494
# +76519 0 374.6700 374.6700 +<method 'has_key' of 'dict' objects>
1496
# 704 0 3733.2820 2209.6520 bzrlib.knit:1336(find_longest_match)
1497
# +211400 0 1147.3520 1147.3520 +<method 'get' of 'dict' objects>
1498
# +76519 0 376.2780 376.2780 +<method 'has_key' of 'dict' objects>
1510
k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
1512
besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
1515
# Extend the best by non-junk elements on each end. In particular,
1516
# "popular" non-junk elements aren't in b2j, which greatly speeds
1517
# the inner loop above, but also means "the best" match so far
1518
# doesn't contain any junk *or* popular non-junk elements.
1519
while besti > alo and bestj > blo and \
1520
not isbjunk(b[bestj-1]) and \
1521
a[besti-1] == b[bestj-1]:
1522
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
1523
while besti+bestsize < ahi and bestj+bestsize < bhi and \
1524
not isbjunk(b[bestj+bestsize]) and \
1525
a[besti+bestsize] == b[bestj+bestsize]:
1528
# Now that we have a wholly interesting match (albeit possibly
1529
# empty!), we may as well suck up the matching junk on each
1530
# side of it too. Can't think of a good reason not to, and it
1531
# saves post-processing the (possibly considerable) expense of
1532
# figuring out what to do with it. In the case of an empty
1533
# interesting match, this is clearly the right thing to do,
1534
# because no other kind of match is possible in the regions.
1535
while besti > alo and bestj > blo and \
1536
isbjunk(b[bestj-1]) and \
1537
a[besti-1] == b[bestj-1]:
1538
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
1539
while besti+bestsize < ahi and bestj+bestsize < bhi and \
1540
isbjunk(b[bestj+bestsize]) and \
1541
a[besti+bestsize] == b[bestj+bestsize]:
1542
bestsize = bestsize + 1
1544
return besti, bestj, bestsize