~bzr-pqm/bzr/bzr.dev

Viewing changes to tools/history2weaves.py

Committer: Martin Pool
Date: 2005-08-19 22:41:41 UTC
Revision ID: mbp@sourcefrog.net-20050819224141-b43244067d36eee7

- add a tool script to convert past history into weaves

  this is not quite finished yet, but it already works well.
  we still need to upgrade the inventories as we go so that
  they store file revision_ids, which are needed to get the
  right file back out of the weave.

  the results compare quite well: the history of bzr up to
  this point takes 25688kB in the gzipped full-text store,
  3676kB in the weave store, and 1460kB in the gzipped weave
  store.  so about 20x compression before gzip.  a full
  working directory is 3084kB.

  using this we can get accurate, fast annotations of all past
  versions, though for the moment you still need to map the
  weave version numbers back to revisions by hand.

files modified:
tools/history2weaves.py

Show diffs side-by-side

added added

removed removed

tools/history2weaves.py

inv_weave = Weave()

last_text_sha = {}

text_rfs = {}

# holds in-memory weaves for all files

text_weaves = {}

b = bzrlib.branch.find_branch('.')

revno = 1

rev_history = b.revision_history()

last_idx = None

parents = []

inv_parents = []

text_count = 0

for rev_id in rev_history:

pb.update('converting inventory', revno, len(rev_history))

inv_xml = b.get_inventory_xml(rev_id).readlines()

new_idx = inv_weave.add(rev_id, parents, inv_xml)

parents = [new_idx]

# tree = b.revision_tree(rev_id)

# inv = tree.inventory

# # for each file in the inventory, put it into its own revfile

# for file_id in inv:

# ie = inv[file_id]

# if ie.kind != 'file':

# continue

# if last_text_sha.get(file_id) == ie.text_sha1:

# # same as last time

# continue

# last_text_sha[file_id] = ie.text_sha1

# # new text (though possibly already stored); need to store it

# text = tree.get_file(file_id).read()

# if file_id not in text_rfs:

# text_rfs[file_id] = Revfile('revfiles/' + file_id, 'w')

# rf = text_rfs[file_id]

# last = len(rf)

# if last == 0:

# last = None

# else:

# last -= 1

# rf.add(text, last, compress=True)

new_idx = inv_weave.add(rev_id, inv_parents, inv_xml)

inv_parents = [new_idx]

tree = b.revision_tree(rev_id)

inv = tree.inventory

# for each file in the inventory, put it into its own revfile

for file_id in inv:

ie = inv[file_id]

if ie.kind != 'file':

continue

if last_text_sha.get(file_id) == ie.text_sha1:

# same as last time

continue

last_text_sha[file_id] = ie.text_sha1

# new text (though possibly already stored); need to store it

text_lines = tree.get_file(file_id).readlines()

# if the file's created for the first time in this

# revision then make a new weave; else find the old one

if file_id not in text_weaves:

text_weaves[file_id] = Weave()

w = text_weaves[file_id]

# base the new text version off whatever was last

# (actually it'd be better to track this, to allow for

# files that are deleted and then reappear)

last = len(w)

if last == 0:

parents = []

else:

parents = [last-1]

w.add(rev_id, parents, text_lines)

text_count += 1

revno += 1

100

inv_wf = AtomicFile('/tmp/inventory.weave')

101

pb.clear()

102

print '%6d revisions and inventories' % revno

103

print '%6d texts' % text_count

104

105

i = 0

106

# TODO: commit them all atomically at the end, not one by one

107

write_atomic_weave(inv_weave, 'weaves/inventory.weave')

108

for file_id, file_weave in text_weaves.items():

109

pb.update('writing weave', i, text_count)

110

write_atomic_weave(file_weave, 'weaves/%s.weave' % file_id)

111

i += 1

112

113

pb.clear()

114

115

116

def write_atomic_weave(weave, filename):

117

inv_wf = AtomicFile(filename)

118

try:

write_weave(inv_weave, inv_wf)

119

write_weave(weave, inv_wf)

120

inv_wf.commit()

121

finally:

122

inv_wf.close()

123

pb.clear()

124

125

126

100

127

def profile_convert():

Older »