3
# Copyright (C) 2005 Canonical Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License
16
# along with this program; if not, write to the Free Software
17
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
# Author: Martin Pool <mbp@canonical.com>
22
"""Weave - storage of related text file versions"""
24
# before intset (r923) 2000 versions in 41.5s
25
# with intset (r926) 2000 versions in 93s !!!
26
# better to just use plain sets.
28
# making _extract build and return a list, rather than being a generator
31
# with python -O, r923 does 2000 versions in 36.87s
33
# with optimizations to avoid mutating lists - 35.75! I guess copying
34
# all the elements every time costs more than the small manipulations.
35
# a surprisingly small change.
37
# r931, which avoids using a generator for extract, does 36.98s
39
# with memoized inclusions, takes 41.49s; not very good
41
# with slots, takes 37.35s; without takes 39.16, a bit surprising
43
# with the delta calculation mixed in with the add method, rather than
44
# separated, takes 36.78s
46
# with delta folded in and mutation of the list, 36.13s
48
# with all this and simplification of add code, 33s
51
# TODO: Perhaps have copy method for Weave instances?
53
# XXX: If we do weaves this way, will a merge still behave the same
54
# way if it's done in a different order? That's a pretty desirable
57
# TODO: Nothing here so far assumes the lines are really \n newlines,
58
# rather than being split up in some other way. We could accomodate
59
# binaries, perhaps by naively splitting on \n or perhaps using
60
# something like a rolling checksum.
62
# TODO: Track version names as well as indexes.
64
# TODO: End marker for each version so we can stop reading?
66
# TODO: Check that no insertion occurs inside a deletion that was
67
# active in the version of the insertion.
69
# TODO: In addition to the SHA-1 check, perhaps have some code that
70
# checks structural constraints of the weave: ie that insertions are
71
# properly nested, that there is no text outside of an insertion, that
72
# insertions or deletions are not repeated, etc.
74
# TODO: Make the info command just show info, not extract everything:
75
# it can be much faster.
77
# TODO: Perhaps use long integers as sets instead of set objects; may
80
# TODO: Parallel-extract that passes back each line along with a
81
# description of which revisions include it. Nice for checking all
87
class WeaveError(Exception):
88
"""Exception in processing weave"""
91
class WeaveFormatError(WeaveError):
92
"""Weave invariant violated"""
96
"""weave - versioned text file storage.
98
A Weave manages versions of line-based text files, keeping track
99
of the originating version for each line.
101
To clients the "lines" of the file are represented as a list of strings.
102
These strings will typically have terminal newline characters, but
103
this is not required. In particular files commonly do not have a newline
104
at the end of the file.
106
Texts can be identified in either of two ways:
108
* a nonnegative index number.
110
* a version-id string.
112
Typically the index number will be valid only inside this weave and
113
the version-id is used to reference it in the larger world.
115
The weave is represented as a list mixing edit instructions and
116
literal text. Each entry in _l can be either a string (or
117
unicode), or a tuple. If a string, it means that the given line
118
should be output in the currently active revisions.
120
If a tuple, it gives a processing instruction saying in which
121
revisions the enclosed lines are active. The tuple has the form
122
(instruction, version).
124
The instruction can be '{' or '}' for an insertion block, and '['
125
and ']' for a deletion block respectively. The version is the
126
integer version index. There is no replace operator, only deletes
131
* A later version can delete lines that were introduced by any
132
number of ancestor versions; this implies that deletion
133
instructions can span insertion blocks without regard to the
134
insertion block's nesting.
136
* Similarly, deletions need not be properly nested with regard to
137
each other, because they might have been generated by
138
independent revisions.
140
* Insertions are always made by inserting a new bracketed block
141
into a single point in the previous weave. This implies they
142
can nest but not overlap, and the nesting must always have later
143
insertions on the inside.
145
* It doesn't seem very useful to have an active insertion
146
inside an inactive insertion, but it might happen.
148
* Therefore, all instructions are always"considered"; that
149
is passed onto and off the stack. An outer inactive block
150
doesn't disable an inner block.
152
* Lines are enabled if the most recent enclosing insertion is
153
active and none of the enclosing deletions are active.
155
* There is no point having a deletion directly inside its own
156
insertion; you might as well just not write it. And there
157
should be no way to get an earlier version deleting a later
164
List of parents, indexed by version number.
165
It is only necessary to store the minimal set of parents for
166
each version; the parent's parents are implied.
169
List of hex SHA-1 of each version, or None if not recorded.
172
## __slots__ = ['_l', '_v', '_sha1s']
180
def __eq__(self, other):
181
if not isinstance(other, Weave):
183
return self._v == other._v \
184
and self._l == other._l
187
def __ne__(self, other):
188
return not self.__eq__(other)
191
def add(self, parents, text):
192
"""Add a single text on top of the weave.
194
Returns the index number of the newly added version.
197
List or set of direct parent version numbers.
200
Sequence of lines to be added in the new version."""
202
self._check_versions(parents)
203
## self._check_lines(text)
204
new_version = len(self._v)
212
# if we abort after here the weave will be corrupt
213
self._v.append(frozenset(parents))
214
self._sha1s.append(sha1)
218
# special case; adding with no parents revision; can do
219
# this more quickly by just appending unconditionally.
220
# even more specially, if we're adding an empty text we
221
# need do nothing at all.
223
self._l.append(('{', new_version))
225
self._l.append(('}', new_version))
229
if len(parents) == 1:
230
pv = list(parents)[0]
231
if sha1 == self._sha1s[pv]:
232
# special case: same as the single parent
236
ancestors = self.inclusions(parents)
240
# basis a list of (origin, lineno, line)
243
for origin, lineno, line in self._extract(ancestors):
244
basis_lineno.append(lineno)
245
basis_lines.append(line)
247
# another small special case: a merge, producing the same text as auto-merge
248
if text == basis_lines:
251
# add a sentinal, because we can also match against the final line
252
basis_lineno.append(len(self._l))
254
# XXX: which line of the weave should we really consider
255
# matches the end of the file? the current code says it's the
256
# last line of the weave?
258
#print 'basis_lines:', basis_lines
259
#print 'new_lines: ', lines
261
from difflib import SequenceMatcher
262
s = SequenceMatcher(None, basis_lines, text)
264
# offset gives the number of lines that have been inserted
265
# into the weave up to the current point; if the original edit instruction
266
# says to change line A then we actually change (A+offset)
269
for tag, i1, i2, j1, j2 in s.get_opcodes():
270
# i1,i2 are given in offsets within basis_lines; we need to map them
271
# back to offsets within the entire weave
272
#print 'raw match', tag, i1, i2, j1, j2
276
i1 = basis_lineno[i1]
277
i2 = basis_lineno[i2]
279
assert 0 <= j1 <= j2 <= len(text)
281
#print tag, i1, i2, j1, j2
283
# the deletion and insertion are handled separately.
284
# first delete the region.
286
self._l.insert(i1+offset, ('[', new_version))
287
self._l.insert(i2+offset+1, (']', new_version))
291
# there may have been a deletion spanning up to
292
# i2; we want to insert after this region to make sure
293
# we don't destroy ourselves
295
self._l[i:i] = ([('{', new_version)]
297
+ [('}', new_version)])
298
offset += 2 + (j2 - j1)
303
def inclusions(self, versions):
304
"""Return set of all ancestors of given version(s)."""
310
# include all its parents
315
raise ValueError("version %d not present in weave" % v)
318
def minimal_parents(self, version):
319
"""Find the minimal set of parents for the version."""
320
included = self._v[version]
325
li.sort(reverse=True)
333
gotit.update(self.inclusions(pv))
335
assert mininc[0] >= 0
336
assert mininc[-1] < version
341
def _check_lines(self, text):
342
if not isinstance(text, list):
343
raise ValueError("text should be a list, not %s" % type(text))
346
if not isinstance(l, basestring):
347
raise ValueError("text line should be a string or unicode, not %s"
352
def _check_versions(self, indexes):
353
"""Check everything in the sequence of indexes is valid"""
358
raise IndexError("invalid version number %r" % i)
361
def annotate(self, index):
362
return list(self.annotate_iter(index))
365
def annotate_iter(self, version):
366
"""Yield list of (index-id, line) pairs for the specified version.
368
The index indicates when the line originated in the weave."""
369
for origin, lineno, text in self._extract([version]):
377
(lineno, insert, deletes, text)
378
for each literal line.
384
lineno = 0 # line of weave, 0-based
387
if isinstance(l, tuple):
400
raise WeaveFormatError('unexpected instruction %r'
403
assert isinstance(l, basestring)
405
yield lineno, istack[-1], dset, l
410
def _extract(self, versions):
411
"""Yield annotation of lines in included set.
413
Yields a sequence of tuples (origin, lineno, text), where
414
origin is the origin version, lineno the index in the weave,
415
and text the text of the line.
417
The set typically but not necessarily corresponds to a version.
419
included = self.inclusions(versions)
424
lineno = 0 # line of weave, 0-based
430
WFE = WeaveFormatError
433
if isinstance(l, tuple):
437
assert v not in istack
452
assert isinstance(l, basestring)
454
isactive = (not dset) and istack and (istack[-1] in included)
456
result.append((istack[-1], lineno, l))
460
raise WFE("unclosed insertion blocks at end of weave",
463
raise WFE("unclosed deletion blocks at end of weave",
470
def get_iter(self, version):
471
"""Yield lines for the specified version."""
472
for origin, lineno, line in self._extract([version]):
476
def get(self, index):
477
return list(self.get_iter(index))
480
def mash_iter(self, included):
481
"""Return composed version of multiple included versions."""
482
for origin, lineno, text in self._extract(included):
486
def dump(self, to_file):
487
from pprint import pprint
488
print >>to_file, "Weave._l = ",
489
pprint(self._l, to_file)
490
print >>to_file, "Weave._v = ",
491
pprint(self._v, to_file)
495
def numversions(self):
497
assert l == len(self._sha1s)
501
def check(self, progress_bar=None):
502
# check no circular inclusions
503
for version in range(self.numversions()):
504
inclusions = list(self._v[version])
507
if inclusions[-1] >= version:
508
raise WeaveFormatError("invalid included version %d for index %d"
509
% (inclusions[-1], version))
511
# try extracting all versions; this is a bit slow and parallel
512
# extraction could be used
514
nv = self.numversions()
515
for version in range(nv):
517
progress_bar.update('checking text', version, nv)
519
for l in self.get_iter(version):
522
expected = self._sha1s[version]
524
raise WeaveError("mismatched sha1 for version %d; "
525
"got %s, expected %s"
526
% (version, hd, expected))
528
# TODO: check insertions are properly nested, that there are
529
# no lines outside of insertion blocks, that deletions are
530
# properly paired, etc.
534
def merge(self, merge_versions):
535
"""Automerge and mark conflicts between versions.
537
This returns a sequence, each entry describing alternatives
538
for a chunk of the file. Each of the alternatives is given as
541
If there is a chunk of the file where there's no diagreement,
542
only one alternative is given.
545
# approach: find the included versions common to all the
547
raise NotImplementedError()
551
def _delta(self, included, lines):
552
"""Return changes from basis to new revision.
554
The old text for comparison is the union of included revisions.
556
This is used in inserting a new text.
558
Delta is returned as a sequence of
559
(weave1, weave2, newlines).
561
This indicates that weave1:weave2 of the old weave should be
562
replaced by the sequence of lines in newlines. Note that
563
these line numbers are positions in the total weave and don't
564
correspond to the lines in any extracted version, or even the
565
extracted union of included versions.
567
If line1=line2, this is a pure insert; if newlines=[] this is a
568
pure delete. (Similar to difflib.)
573
def plan_merge(self, ver_a, ver_b):
574
"""Return pseudo-annotation indicating how the two versions merge.
576
This is computed between versions a and b and their common
579
Weave lines present in none of them are skipped entirely.
581
inc_a = self.inclusions([ver_a])
582
inc_b = self.inclusions([ver_b])
583
inc_c = inc_a & inc_b
585
for lineno, insert, deleteset, line in self._walk():
586
if deleteset & inc_c:
587
# killed in parent; can't be in either a or b
588
# not relevant to our work
589
yield 'killed-base', line
590
elif insert in inc_c:
591
# was inserted in base
592
killed_a = bool(deleteset & inc_a)
593
killed_b = bool(deleteset & inc_b)
594
if killed_a and killed_b:
595
yield 'killed-both', line
597
yield 'killed-a', line
599
yield 'killed-b', line
601
yield 'unchanged', line
602
elif insert in inc_a:
603
if deleteset & inc_a:
604
yield 'ghost-a', line
608
elif insert in inc_b:
609
if deleteset & inc_b:
610
yield 'ghost-b', line
614
# not in either revision
615
yield 'irrelevant', line
617
yield 'unchanged', '' # terminator
621
def weave_merge(self, plan):
626
for state, line in plan:
627
if state == 'unchanged' or state == 'killed-both':
628
# resync and flush queued conflicts changes if any
629
if not lines_a and not lines_b:
631
elif ch_a and not ch_b:
633
for l in lines_a: yield l
634
elif ch_b and not ch_a:
635
for l in lines_b: yield l
636
elif lines_a == lines_b:
637
for l in lines_a: yield l
640
for l in lines_a: yield l
642
for l in lines_b: yield l
649
if state == 'unchanged':
652
elif state == 'killed-a':
655
elif state == 'killed-b':
658
elif state == 'new-a':
661
elif state == 'new-b':
665
assert state in ('irrelevant', 'ghost-a', 'ghost-b', 'killed-base',
675
def weave_info(filename, out):
676
"""Show some text information about the weave."""
677
from weavefile import read_weave
678
wf = file(filename, 'rb')
680
# FIXME: doesn't work on pipes
681
weave_size = wf.tell()
682
print >>out, "weave file size %d bytes" % weave_size
683
print >>out, "weave contains %d versions" % len(w._v)
686
print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')
687
for i in (6, 6, 8, 40, 20):
690
for i in range(len(w._v)):
693
bytes = sum((len(a) for a in text))
695
print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),
701
print >>out, "versions total %d bytes" % total
702
print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))
706
print """bzr weave tool
708
Experimental tool for weave algorithm.
712
Create an empty weave file
713
weave get WEAVEFILE VERSION
714
Write out specified version.
715
weave check WEAVEFILE
716
Check consistency of all versions.
718
Display table of contents.
719
weave add WEAVEFILE [BASE...] < NEWTEXT
720
Add NEWTEXT, with specified parent versions.
721
weave annotate WEAVEFILE VERSION
722
Display origin of each line.
723
weave mash WEAVEFILE VERSION...
724
Display composite of all selected versions.
725
weave merge WEAVEFILE VERSION1 VERSION2 > OUT
726
Auto-merge two versions and display conflicts.
730
% weave init foo.weave
732
% weave add foo.weave < foo.txt
735
(create updated version)
737
% weave get foo.weave 0 | diff -u - foo.txt
738
% weave add foo.weave 0 < foo.txt
741
% weave get foo.weave 0 > foo.txt (create forked version)
743
% weave add foo.weave 0 < foo.txt
746
% weave merge foo.weave 1 2 > foo.txt (merge them)
747
% vi foo.txt (resolve conflicts)
748
% weave add foo.weave 1 2 < foo.txt (commit merged version)
757
from weavefile import write_weave, read_weave
758
from bzrlib.progress import ProgressBar
766
return read_weave(file(argv[2], 'rb'))
772
# at the moment, based on everything in the file
773
parents = map(int, argv[3:])
774
lines = sys.stdin.readlines()
775
ver = w.add(parents, lines)
776
write_weave(w, file(argv[2], 'wb'))
777
print 'added version %d' % ver
780
if os.path.exists(fn):
781
raise IOError("file exists")
783
write_weave(w, file(fn, 'wb'))
784
elif cmd == 'get': # get one version
786
sys.stdout.writelines(w.get_iter(int(argv[3])))
788
elif cmd == 'mash': # get composite
790
sys.stdout.writelines(w.mash_iter(map(int, argv[3:])))
792
elif cmd == 'annotate':
794
# newline is added to all lines regardless; too hard to get
795
# reasonable formatting otherwise
797
for origin, text in w.annotate(int(argv[3])):
798
text = text.rstrip('\r\n')
800
print ' | %s' % (text)
802
print '%5d | %s' % (origin, text)
806
weave_info(argv[2], sys.stdout)
813
print '%d versions ok' % w.numversions()
815
elif cmd == 'inclusions':
817
print ' '.join(map(str, w.inclusions([int(argv[3])])))
819
elif cmd == 'parents':
821
print ' '.join(map(str, w._v[int(argv[3])]))
823
elif cmd == 'plan-merge':
825
for state, line in w.plan_merge(int(argv[3]), int(argv[4])):
827
print '%14s | %s' % (state, line),
831
p = w.plan_merge(int(argv[3]), int(argv[4]))
832
sys.stdout.writelines(w.weave_merge(p))
834
elif cmd == 'mash-merge':
840
v1, v2 = map(int, argv[3:5])
842
basis = w.inclusions([v1]).intersection(w.inclusions([v2]))
844
base_lines = list(w.mash_iter(basis))
845
a_lines = list(w.get(v1))
846
b_lines = list(w.get(v2))
848
from bzrlib.merge3 import Merge3
849
m3 = Merge3(base_lines, a_lines, b_lines)
851
name_a = 'version %d' % v1
852
name_b = 'version %d' % v2
853
sys.stdout.writelines(m3.merge_lines(name_a=name_a, name_b=name_b))
855
raise ValueError('unknown command %r' % cmd)
858
if __name__ == '__main__':
860
sys.exit(main(sys.argv))