~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to tools/history2weaves.py

Committer: Robert Collins
Date: 2005-08-25 12:46:42 UTC
mfrom: (1116)
mto: (974.1.50) (1185.1.10) (1092.3.1)
mto: This revision was merged to the branch mainline in revision 1139.
Revision ID: robertc@robertcollins.net-20050825124642-45ed1cd74db10370

merge from mpool

files added:
bzrlib/meta_store.py

bzrlib/plugins/checkperms

bzrlib/remotebranch.py

bzrlib/upgrade.py

patches/annotate3.patch

patches/annotate4.patch

patches/pending-merge.patch

patches/plugins-no-plugins.patch

patches/progress.diff

patches/symlink-support.patch

testsweet.py

files removed:
INSTALL

NEWS.developers

bzrlib/annotate.py

bzrlib/builtins.py

bzrlib/clone.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/externalcommand.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/identitymap.py

bzrlib/msgeditor.py

bzrlib/option.py

bzrlib/revisionspec.py

bzrlib/selftest/HTTPTestUtil.py

bzrlib/selftest/stub_sftp.py

bzrlib/selftest/test_ancestry.py

bzrlib/selftest/test_bad_files.py

bzrlib/selftest/test_command.py

bzrlib/selftest/test_commit.py

bzrlib/selftest/test_commit_merge.py

bzrlib/selftest/test_conflicts.py

bzrlib/selftest/test_parent.py

bzrlib/selftest/test_revision_info.py

bzrlib/selftest/test_upgrade.py

bzrlib/selftest/test_xml.py

bzrlib/selftest/testannotate.py

bzrlib/selftest/testapi.py

bzrlib/selftest/testconfig.py

bzrlib/selftest/testgpg.py

bzrlib/selftest/testgraph.py

bzrlib/selftest/testhttp.py

bzrlib/selftest/testidentitymap.py

bzrlib/selftest/testmerge.py

bzrlib/selftest/testnonascii.py

bzrlib/selftest/testoptions.py

bzrlib/selftest/testrevprops.py

bzrlib/selftest/testreweave.py

bzrlib/selftest/testsampler.py

bzrlib/selftest/testsftp.py

bzrlib/selftest/teststore.py

bzrlib/selftest/testtestament.py

bzrlib/selftest/testtransactions.py

bzrlib/selftest/testtransport.py

bzrlib/selftest/testtsort.py

bzrlib/selftest/testworkingtree.py

bzrlib/selftest/treeshape.py

bzrlib/store

bzrlib/store/compressed_text.py

bzrlib/store/text.py

bzrlib/store/weave.py

bzrlib/testament.py

bzrlib/transactions.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/http.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/sftp.py

bzrlib/tsort.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/util/configobj/validate.py

bzrlib/win32console.py

bzrlib/xml4.py

bzrlib/xml5.py

patches/cache_weave_inclusions.diff

tools/capture_tree.py

files renamed:
contrib/newinventory.py => bzrlib/newinventory.py

bzrlib/selftest/testplugins.py => bzrlib/selftest/plugins.py

bzrlib/store/__init__.py => bzrlib/store.py

bzrlib/upgrade.py => tools/history2weaves.py

bzrlib/selftest/test_weave.py => tools/testweave.py

files modified:
.bzrignore

.rsyncexclude

HACKING

Makefile

NEWS

README

TODO

bzr *

bzr-man.py

bzrlib/__init__.py

bzrlib/add.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/changeset.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/delta.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/fetch.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/info.py

bzrlib/intset.py

bzrlib/inventory.py

bzrlib/lock.py

bzrlib/log.py

bzrlib/mdiff.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/merge_core.py

bzrlib/missing.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/plugins/__init__.py

bzrlib/progress.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/selftest/__init__.py

bzrlib/selftest/blackbox.py

bzrlib/selftest/test_merge_core.py

bzrlib/selftest/test_smart_add.py

bzrlib/selftest/testbranch.py

bzrlib/selftest/testfetch.py

bzrlib/selftest/testhashcache.py

bzrlib/selftest/testinv.py

bzrlib/selftest/testlog.py

bzrlib/selftest/testmerge3.py

bzrlib/selftest/testrevision.py

bzrlib/selftest/testrevisionnamespaces.py

bzrlib/selftest/teststatus.py

bzrlib/selftest/versioning.py

bzrlib/selftest/whitebox.py

bzrlib/shellcomplete.py

bzrlib/status.py

bzrlib/textinv.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/weave.py *

bzrlib/weavefile.py

bzrlib/workingtree.py

bzrlib/xml.py

contrib/zsh/_bzr

doc/index.txt

doc/todo-from-arch.txt

setup.py *

tutorial.txt

Show diffs side-by-side

added added

removed removed

tools/history2weaves.py

#! /usr/bin/python

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Experiment in converting existing bzr branches to weaves."""

# To make this properly useful

# 1. assign text version ids, and put those text versions into

# the inventory as they're converted.

# 2. keep track of the previous version of each file, rather than

# just using the last one imported

# 3. assign entry versions when files are added, renamed or moved.

# 4. when merged-in versions are observed, walk down through them

# to discover everything, then commit bottom-up

# 5. track ancestry as things are merged in, and commit that in each

# revision

# Perhaps it's best to first walk the whole graph and make a plan for

# what should be imported in what order? Need a kind of topological

# sort of all revisions. (Or do we? Can we just, before doing a revision,

# see that all its parents have either been converted or abandoned?)

# Cannot import a revision until all its parents have been

# imported. In other words, we can only import revisions whose

# parents have all been imported. The first step must be to

# import a revision with no parents, of which there must be at

# least one. (So perhaps it's useful to store forward pointers

# from a list of parents to their children?)

# Another (equivalent?) approach is to build up the ordered

# ancestry list for the last revision, and walk through that. We

# are going to need that.

# We don't want to have to recurse all the way back down the list.

# Suppose we keep a queue of the revisions able to be processed at

# any point. This starts out with all the revisions having no

# parents.

# This seems like a generally useful algorithm...

# The current algorithm is dumb (O(n**2)?) but will do the job, and

# takes less than a second on the bzr.dev branch.

# This currently does a kind of lazy conversion of file texts, where a

# new text is written in every version. That's unnecessary but for

# the moment saves us having to worry about when files need new

# versions.

# TODO: Don't create a progress bar here, have it passed by the caller.

# At least do it from the UI factory.

if False:

try:

import psyco

psyco.full()

except ImportError:

pass

import os

import tempfile

import sys

import logging

import shutil

from bzrlib.branch import Branch, find_branch

from bzrlib.branch import BZR_BRANCH_FORMAT_5, BZR_BRANCH_FORMAT_6

import bzrlib.hashcache as hashcache

try:

import psyco

psyco.full()

except ImportError:

pass

import bzrlib.branch

from bzrlib.revfile import Revfile

from bzrlib.weave import Weave

from bzrlib.weavefile import read_weave, write_weave

from bzrlib.ui import ui_factory

from bzrlib.progress import ProgressBar

from bzrlib.atomicfile import AtomicFile

from bzrlib.xml4 import serializer_v4

from bzrlib.xml5 import serializer_v5

from bzrlib.trace import mutter, note, warning, enable_default_logging

from bzrlib.osutils import sha_strings, sha_string

100

101

102

class Convert(object):

103

def __init__(self, base_dir):

104

self.base = base_dir

105

self.converted_revs = set()

106

self.absent_revisions = set()

107

self.text_count = 0

108

self.revisions = {}

109

self.convert()

110

111

112

def convert(self):

113

if not self._open_branch():

114

return

115

note('starting upgrade of %s', os.path.abspath(self.base))

116

self._backup_control_dir()

117

self.pb = ui_factory.progress_bar()

118

if self.old_format == 4:

119

note('starting upgrade from format 4 to 5')

120

self._convert_to_weaves()

121

self._open_branch()

122

if self.old_format == 5:

123

note('starting upgrade from format 5 to 6')

124

self._convert_to_prefixed()

125

self._open_branch()

126

cache = hashcache.HashCache(os.path.abspath(self.base))

127

cache.clear()

128

cache.write()

129

note("finished")

130

131

132

def _convert_to_prefixed(self):

133

from bzrlib.store import hash_prefix

134

for store_name in ["weaves", "revision-store"]:

135

note("adding prefixes to %s" % store_name)

136

store_dir = os.path.join(self.base, ".bzr", store_name)

137

for filename in os.listdir(store_dir):

138

if filename.endswith(".weave") or filename.endswith(".gz"):

139

file_id = os.path.splitext(filename)[0]

140

else:

141

file_id = filename

142

prefix_dir = os.path.join(store_dir, hash_prefix(file_id))

143

if not os.path.isdir(prefix_dir):

144

os.mkdir(prefix_dir)

145

os.rename(os.path.join(store_dir, filename),

146

os.path.join(prefix_dir, filename))

147

self._set_new_format(BZR_BRANCH_FORMAT_6)

148

149

150

def _convert_to_weaves(self):

151

note('note: upgrade may be faster if all store files are ungzipped first')

152

if not os.path.isdir(self.base + '/.bzr/weaves'):

153

os.mkdir(self.base + '/.bzr/weaves')

154

self.inv_weave = Weave('inventory')

155

# holds in-memory weaves for all files

156

self.text_weaves = {}

157

os.remove(self.branch.controlfilename('branch-format'))

158

self._convert_working_inv()

159

rev_history = self.branch.revision_history()

160

# to_read is a stack holding the revisions we still need to process;

161

# appending to it adds new highest-priority revisions

162

self.known_revisions = set(rev_history)

163

self.to_read = rev_history[-1:]

164

while self.to_read:

165

rev_id = self.to_read.pop()

166

if (rev_id not in self.revisions

167

and rev_id not in self.absent_revisions):

168

self._load_one_rev(rev_id)

169

self.pb.clear()

170

to_import = self._make_order()

171

for i, rev_id in enumerate(to_import):

172

self.pb.update('converting revision', i, len(to_import))

173

self._convert_one_rev(rev_id)

174

self.pb.clear()

175

note('upgraded to weaves:')

176

note(' %6d revisions and inventories' % len(self.revisions))

177

note(' %6d revisions not present' % len(self.absent_revisions))

178

note(' %6d texts' % self.text_count)

179

self._write_all_weaves()

180

self._write_all_revs()

181

self._cleanup_spare_files()

182

self._set_new_format(BZR_BRANCH_FORMAT_5)

183

184

185

def _open_branch(self):

186

self.branch = Branch.open_downlevel(self.base)

187

self.old_format = self.branch._branch_format

188

if self.old_format == 6:

189

note('this branch is in the most current format')

190

return False

191

if self.old_format not in (4, 5):

192

raise BzrError("cannot upgrade from branch format %r" %

193

self.branch._branch_format)

194

return True

195

196

197

def _set_new_format(self, format):

198

self.branch.put_controlfile('branch-format', format)

199

200

201

def _cleanup_spare_files(self):

202

for n in 'merged-patches', 'pending-merged-patches':

203

p = self.branch.controlfilename(n)

204

if not os.path.exists(p):

205

continue

206

## assert os.path.getsize(p) == 0

207

os.remove(p)

208

shutil.rmtree(self.base + '/.bzr/inventory-store')

209

shutil.rmtree(self.base + '/.bzr/text-store')

210

211

212

def _backup_control_dir(self):

213

orig = self.base + '/.bzr'

214

backup = orig + '.backup'

215

note('making backup of tree history')

216

shutil.copytree(orig, backup)

217

note('%s has been backed up to %s', orig, backup)

218

note('if conversion fails, you can move this directory back to .bzr')

219

note('if it succeeds, you can remove this directory if you wish')

220

221

222

def _convert_working_inv(self):

223

branch = self.branch

224

inv = serializer_v4.read_inventory(branch.controlfile('inventory', 'rb'))

225

new_inv_xml = serializer_v5.write_inventory_to_string(inv)

226

branch.put_controlfile('inventory', new_inv_xml)

227

228

229

230

def _write_all_weaves(self):

231

write_a_weave(self.inv_weave, self.base + '/.bzr/inventory.weave')

232

i = 0

233

try:

234

for file_id, file_weave in self.text_weaves.items():

235

self.pb.update('writing weave', i, len(self.text_weaves))

236

write_a_weave(file_weave, self.base + '/.bzr/weaves/%s.weave' % file_id)

237

i += 1

238

finally:

239

self.pb.clear()

240

241

242

def _write_all_revs(self):

243

"""Write all revisions out in new form."""

244

shutil.rmtree(self.base + '/.bzr/revision-store')

245

os.mkdir(self.base + '/.bzr/revision-store')

246

try:

247

for i, rev_id in enumerate(self.converted_revs):

248

self.pb.update('write revision', i, len(self.converted_revs))

249

f = file(self.base + '/.bzr/revision-store/%s' % rev_id, 'wb')

250

try:

251

serializer_v5.write_revision(self.revisions[rev_id], f)

252

finally:

253

f.close()

254

finally:

255

self.pb.clear()

256

257

258

def _load_one_rev(self, rev_id):

259

"""Load a revision object into memory.

260

261

Any parents not either loaded or abandoned get queued to be

262

loaded."""

263

self.pb.update('loading revision',

264

len(self.revisions),

265

len(self.known_revisions))

266

if not self.branch.revision_store.has_id(rev_id):

267

self.pb.clear()

268

note('revision {%s} not present in branch; '

269

'will be converted as a ghost',

270

rev_id)

271

self.absent_revisions.add(rev_id)

272

else:

273

rev_xml = self.branch.revision_store.get(rev_id).read()

274

rev = serializer_v4.read_revision_from_string(rev_xml)

275

for parent_id in rev.parent_ids:

276

self.known_revisions.add(parent_id)

277

self.to_read.append(parent_id)

278

self.revisions[rev_id] = rev

279

280

281

def _load_old_inventory(self, rev_id):

282

assert rev_id not in self.converted_revs

283

old_inv_xml = self.branch.inventory_store.get(rev_id).read()

284

inv = serializer_v4.read_inventory_from_string(old_inv_xml)

285

rev = self.revisions[rev_id]

286

if rev.inventory_sha1:

287

assert rev.inventory_sha1 == sha_string(old_inv_xml), \

288

'inventory sha mismatch for {%s}' % rev_id

289

return inv

import tempfile

import hotshot, hotshot.stats

import sys

def convert():

pb = ProgressBar()

inv_weave = Weave()

last_text_sha = {}

# holds in-memory weaves for all files

text_weaves = {}

b = bzrlib.branch.find_branch('.')

revno = 1

rev_history = b.revision_history()

last_idx = None

inv_parents = []

text_count = 0

for rev_id in rev_history:

pb.update('converting revision', revno, len(rev_history))

290

291

292

def _load_updated_inventory(self, rev_id):

293

assert rev_id in self.converted_revs

294

inv_xml = self.inv_weave.get_text(rev_id)

295

inv = serializer_v5.read_inventory_from_string(inv_xml)

296

return inv

297

298

299

def _convert_one_rev(self, rev_id):

300

"""Convert revision and all referenced objects to new format."""

301

rev = self.revisions[rev_id]

302

inv = self._load_old_inventory(rev_id)

303

present_parents = [p for p in rev.parent_ids

304

if p not in self.absent_revisions]

305

self._convert_revision_contents(rev, inv, present_parents)

306

self._store_new_weave(rev, inv, present_parents)

307

self.converted_revs.add(rev_id)

308

309

310

def _store_new_weave(self, rev, inv, present_parents):

311

# the XML is now updated with text versions

312

if __debug__:

313

for file_id in inv:

314

ie = inv[file_id]

315

if ie.kind == 'root_directory':

316

continue

317

assert hasattr(ie, 'revision'), \

318

'no revision on {%s} in {%s}' % \

319

(file_id, rev.revision_id)

320

new_inv_xml = serializer_v5.write_inventory_to_string(inv)

321

new_inv_sha1 = sha_string(new_inv_xml)

322

self.inv_weave.add(rev.revision_id,

323

present_parents,

324

new_inv_xml.splitlines(True),

325

new_inv_sha1)

326

rev.inventory_sha1 = new_inv_sha1

327

328

def _convert_revision_contents(self, rev, inv, present_parents):

329

"""Convert all the files within a revision.

330

331

Also upgrade the inventory to refer to the text revision ids."""

332

rev_id = rev.revision_id

333

mutter('converting texts of revision {%s}',

334

rev_id)

335

parent_invs = map(self._load_updated_inventory, present_parents)

inv_xml = b.get_inventory_xml(rev_id).readlines()

new_idx = inv_weave.add(rev_id, inv_parents, inv_xml)

inv_parents = [new_idx]

tree = b.revision_tree(rev_id)

inv = tree.inventory

# for each file in the inventory, put it into its own revfile

336

for file_id in inv:

337

ie = inv[file_id]

338

self._convert_file_version(rev, ie, parent_invs)

339

340

def _convert_file_version(self, rev, ie, parent_invs):

341

"""Convert one version of one file.

342

343

The file needs to be added into the weave if it is a merge

344

of >=2 parents or if it's changed from its parent.

345

"""

346

if ie.kind == 'root_directory':

347

return

348

file_id = ie.file_id

349

rev_id = rev.revision_id

350

w = self.text_weaves.get(file_id)

351

if w is None:

352

w = Weave(file_id)

353

self.text_weaves[file_id] = w

354

text_changed = False

355

previous_entries = ie.find_previous_heads(parent_invs, w)

356

for old_revision in previous_entries:

357

# if this fails, it's a ghost?

358

assert old_revision in self.converted_revs

359

self.snapshot_ie(previous_entries, ie, w, rev_id)

360

del ie.text_id

361

assert getattr(ie, 'revision', None) is not None

362

363

def snapshot_ie(self, previous_revisions, ie, w, rev_id):

364

# TODO: convert this logic, which is ~= snapshot to

365

# a call to ie.snapshot (sketched below). This needs the path figured out; rather than a work_tree,

366

# a v4 revision_tree can be given, or something that looks enough like

367

# one to give the file content to the entry if it needs it.

368

# and we need something that looks like a weave store for snapshot to

369

# save against.

370

#ie.snapshot(rev, PATH, previous_revisions, REVISION_TREE, InMemoryWeaveStore(self.text_weaves))

371

if len(previous_revisions) == 1:

372

previous_ie = previous_revisions.values()[0]

373

if ie._unchanged(previous_ie):

374

ie.revision = previous_ie.revision

375

return

376

parent_indexes = map(w.lookup, previous_revisions)

377

if ie.has_text():

378

file_lines = self.branch.text_store.get(ie.text_id).readlines()

379

assert sha_strings(file_lines) == ie.text_sha1

380

assert sum(map(len, file_lines)) == ie.text_size

381

w.add(rev_id, parent_indexes, file_lines, ie.text_sha1)

382

self.text_count += 1

383

else:

384

w.add(rev_id, parent_indexes, [], None)

385

ie.revision = rev_id

386

##mutter('import text {%s} of {%s}',

387

## ie.text_id, file_id)

388

389

def _make_order(self):

390

"""Return a suitable order for importing revisions.

391

392

The order must be such that a revision is imported after all

393

its (present) parents.

394

"""

395

todo = set(self.revisions.keys())

396

done = self.absent_revisions.copy()

397

o = []

398

while todo:

399

# scan through looking for a revision whose parents

400

# are all done

401

for rev_id in sorted(list(todo)):

402

rev = self.revisions[rev_id]

403

parent_ids = set(rev.parent_ids)

404

if parent_ids.issubset(done):

405

# can take this one now

406

o.append(rev_id)

407

todo.remove(rev_id)

408

done.add(rev_id)

409

return o

410

411

412

def write_a_weave(weave, filename):

413

inv_wf = file(filename, 'wb')

if ie.kind != 'file':

continue

if last_text_sha.get(file_id) == ie.text_sha1:

# same as last time

continue

last_text_sha[file_id] = ie.text_sha1

# new text (though possibly already stored); need to store it

text_lines = tree.get_file(file_id).readlines()

# if the file's created for the first time in this

# revision then make a new weave; else find the old one

if file_id not in text_weaves:

text_weaves[file_id] = Weave()

w = text_weaves[file_id]

# base the new text version off whatever was last

# (actually it'd be better to track this, to allow for

# files that are deleted and then reappear)

last = len(w)

if last == 0:

parents = []

else:

parents = [last-1]

w.add(rev_id, parents, text_lines)

text_count += 1

revno += 1

100

101

pb.clear()

102

print '%6d revisions and inventories' % revno

103

print '%6d texts' % text_count

104

105

i = 0

106

# TODO: commit them all atomically at the end, not one by one

107

write_atomic_weave(inv_weave, 'weaves/inventory.weave')

108

for file_id, file_weave in text_weaves.items():

109

pb.update('writing weave', i, len(text_weaves))

110

write_atomic_weave(file_weave, 'weaves/%s.weave' % file_id)

111

i += 1

112

113

pb.clear()

114

115

116

def write_atomic_weave(weave, filename):

117

inv_wf = AtomicFile(filename)

414

118

try:

415

119

write_weave(weave, inv_wf)

120

inv_wf.commit()

416

121

finally:

417

122

inv_wf.close()

418

123

419

420

def upgrade(base_dir):

421

Convert(base_dir)

124

125

126

127

def profile_convert():

128

prof_f = tempfile.NamedTemporaryFile()

129

130

prof = hotshot.Profile(prof_f.name)

131

132

prof.runcall(convert)

133

prof.close()

134

135

stats = hotshot.stats.load(prof_f.name)

136

#stats.strip_dirs()

137

stats.sort_stats('time')

138

## XXX: Might like to write to stderr or the trace file instead but

139

## print_stats seems hardcoded to stdout

140

stats.print_stats(20)

141

142

143

if '-p' in sys.argv[1:]:

144

profile_convert()

145

else:

146

convert()

147

Older »