3
# Copyright (C) 2005 Canonical Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License
16
# along with this program; if not, write to the Free Software
17
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
# Author: Martin Pool <mbp@canonical.com>
22
"""Weave - storage of related text file versions"""
24
# TODO: Perhaps have copy method for Weave instances?
26
# XXX: If we do weaves this way, will a merge still behave the same
27
# way if it's done in a different order? That's a pretty desirable
30
# TODO: Nothing here so far assumes the lines are really \n newlines,
31
# rather than being split up in some other way. We could accomodate
32
# binaries, perhaps by naively splitting on \n or perhaps using
33
# something like a rolling checksum.
35
# TODO: Track version names as well as indexes.
37
# TODO: End marker for each version so we can stop reading?
39
# TODO: Check that no insertion occurs inside a deletion that was
40
# active in the version of the insertion.
42
# TODO: In addition to the SHA-1 check, perhaps have some code that
43
# checks structural constraints of the weave: ie that insertions are
44
# properly nested, that there is no text outside of an insertion, that
45
# insertions or deletions are not repeated, etc.
47
# TODO: Make the info command just show info, not extract everything:
48
# it can be much faster.
50
# TODO: Perhaps use long integers as sets instead of set objects; may
53
# TODO: Parallel-extract that passes back each line along with a
54
# description of which revisions include it. Nice for checking all
64
from sets import Set, ImmutableSet
66
frozenset = ImmutableSet
70
class WeaveError(Exception):
71
"""Exception in processing weave"""
74
class WeaveFormatError(WeaveError):
75
"""Weave invariant violated"""
79
"""weave - versioned text file storage.
81
A Weave manages versions of line-based text files, keeping track
82
of the originating version for each line.
84
To clients the "lines" of the file are represented as a list of strings.
85
These strings will typically have terminal newline characters, but
86
this is not required. In particular files commonly do not have a newline
87
at the end of the file.
89
Texts can be identified in either of two ways:
91
* a nonnegative index number.
93
* a version-id string.
95
Typically the index number will be valid only inside this weave and
96
the version-id is used to reference it in the larger world.
98
The weave is represented as a list mixing edit instructions and
99
literal text. Each entry in _l can be either a string (or
100
unicode), or a tuple. If a string, it means that the given line
101
should be output in the currently active revisions.
103
If a tuple, it gives a processing instruction saying in which
104
revisions the enclosed lines are active. The tuple has the form
105
(instruction, version).
107
The instruction can be '{' or '}' for an insertion block, and '['
108
and ']' for a deletion block respectively. The version is the
109
integer version index. There is no replace operator, only deletes
114
* A later version can delete lines that were introduced by any
115
number of ancestor versions; this implies that deletion
116
instructions can span insertion blocks without regard to the
117
insertion block's nesting.
119
* Similarly, deletions need not be properly nested with regard to
120
each other, because they might have been generated by
121
independent revisions.
123
* Insertions are always made by inserting a new bracketed block
124
into a single point in the previous weave. This implies they
125
can nest but not overlap, and the nesting must always have later
126
insertions on the inside.
128
* It doesn't seem very useful to have an active insertion
129
inside an inactive insertion, but it might happen.
131
* Therefore, all instructions are always"considered"; that
132
is passed onto and off the stack. An outer inactive block
133
doesn't disable an inner block.
135
* Lines are enabled if the most recent enclosing insertion is
136
active and none of the enclosing deletions are active.
138
* There is no point having a deletion directly inside its own
139
insertion; you might as well just not write it. And there
140
should be no way to get an earlier version deleting a later
147
List of parents, indexed by version number.
148
It is only necessary to store the minimal set of parents for
149
each version; the parent's parents are implied.
152
List of hex SHA-1 of each version, or None if not recorded.
160
def __eq__(self, other):
161
if not isinstance(other, Weave):
163
return self._v == other._v \
164
and self._l == other._l
167
def __ne__(self, other):
168
return not self.__eq__(other)
171
def add(self, parents, text):
172
"""Add a single text on top of the weave.
174
Returns the index number of the newly added version.
177
List or set of direct parent version numbers.
180
Sequence of lines to be added in the new version."""
181
## self._check_versions(parents)
182
## self._check_lines(text)
192
# TODO: It'd probably be faster to append things on to a new
193
# list rather than modifying the existing one, which is likely
194
# to cause a lot of copying.
197
ancestors = self.inclusions(parents)
198
delta = self._delta(ancestors, text)
200
# offset gives the number of lines that have been inserted
201
# into the weave up to the current point; if the original edit instruction
202
# says to change line A then we actually change (A+offset)
205
for i1, i2, newlines in delta:
208
assert i2 <= len(self._l)
210
# the deletion and insertion are handled separately.
211
# first delete the region.
213
self._l.insert(i1+offset, ('[', idx))
214
self._l.insert(i2+offset+1, (']', idx))
219
# there may have been a deletion spanning up to
220
# i2; we want to insert after this region to make sure
221
# we don't destroy ourselves
223
self._l[i:i] = [('{', idx)] \
226
offset += 2 + len(newlines)
228
self._addversion(parents)
230
# special case; adding with no parents revision; can do this
231
# more quickly by just appending unconditionally
232
self._l.append(('{', idx))
234
self._l.append(('}', idx))
236
self._addversion(None)
238
self._sha1s.append(sha1)
243
def inclusions_bitset(self, versions):
250
# if v is included, include all its parents
251
for pv in self._v[v]:
257
def inclusions(self, versions):
258
"""Return set of all ancestors of given version(s)."""
264
# include all its parents
269
raise ValueError("version %d not present in weave" % v)
272
def minimal_parents(self, version):
273
"""Find the minimal set of parents for the version."""
274
included = self._v[version]
279
li.sort(reverse=True)
287
gotit.update(self.inclusions(pv))
289
assert mininc[0] >= 0
290
assert mininc[-1] < version
294
def _addversion(self, parents):
296
self._v.append(parents)
298
self._v.append(frozenset())
301
def _check_lines(self, text):
302
if not isinstance(text, list):
303
raise ValueError("text should be a list, not %s" % type(text))
306
if not isinstance(l, basestring):
307
raise ValueError("text line should be a string or unicode, not %s"
312
def _check_versions(self, indexes):
313
"""Check everything in the sequence of indexes is valid"""
318
raise IndexError("invalid version number %r" % i)
321
def annotate(self, index):
322
return list(self.annotate_iter(index))
325
def annotate_iter(self, version):
326
"""Yield list of (index-id, line) pairs for the specified version.
328
The index indicates when the line originated in the weave."""
329
for origin, lineno, text in self._extract([version]):
337
(lineno, insert, deletes, text)
338
for each literal line.
344
lineno = 0 # line of weave, 0-based
347
if isinstance(l, tuple):
356
assert not (dset & vs)
363
raise WeaveFormatError('unexpected instruction %r'
366
assert isinstance(l, basestring)
368
yield lineno, istack[-1], dset, l
373
def _extract(self, versions):
374
"""Yield annotation of lines in included set.
376
Yields a sequence of tuples (origin, lineno, text), where
377
origin is the origin version, lineno the index in the weave,
378
and text the text of the line.
380
The set typically but not necessarily corresponds to a version.
382
included = self.inclusions(versions)
387
lineno = 0 # line of weave, 0-based
391
WFE = WeaveFormatError
394
if isinstance(l, tuple):
398
assert v not in istack
413
assert isinstance(l, basestring)
415
isactive = (not dset) and istack and (istack[-1] in included)
417
yield istack[-1], lineno, l
421
raise WFE("unclosed insertion blocks at end of weave",
424
raise WFE("unclosed deletion blocks at end of weave",
428
def get_iter(self, version):
429
"""Yield lines for the specified version."""
430
for origin, lineno, line in self._extract([version]):
434
def get(self, index):
435
return list(self.get_iter(index))
438
def mash_iter(self, included):
439
"""Return composed version of multiple included versions."""
440
included = frozenset(included)
441
for origin, lineno, text in self._extract(included):
445
def dump(self, to_file):
446
from pprint import pprint
447
print >>to_file, "Weave._l = ",
448
pprint(self._l, to_file)
449
print >>to_file, "Weave._v = ",
450
pprint(self._v, to_file)
454
def numversions(self):
456
assert l == len(self._sha1s)
460
def check(self, progress_bar=None):
461
# check no circular inclusions
462
for version in range(self.numversions()):
463
inclusions = list(self._v[version])
466
if inclusions[-1] >= version:
467
raise WeaveFormatError("invalid included version %d for index %d"
468
% (inclusions[-1], version))
470
# try extracting all versions; this is a bit slow and parallel
471
# extraction could be used
473
nv = self.numversions()
474
for version in range(nv):
476
progress_bar.update('checking text', version, nv)
478
for l in self.get_iter(version):
481
expected = self._sha1s[version]
483
raise WeaveError("mismatched sha1 for version %d; "
484
"got %s, expected %s"
485
% (version, hd, expected))
487
# TODO: check insertions are properly nested, that there are
488
# no lines outside of insertion blocks, that deletions are
489
# properly paired, etc.
493
def merge(self, merge_versions):
494
"""Automerge and mark conflicts between versions.
496
This returns a sequence, each entry describing alternatives
497
for a chunk of the file. Each of the alternatives is given as
500
If there is a chunk of the file where there's no diagreement,
501
only one alternative is given.
504
# approach: find the included versions common to all the
506
raise NotImplementedError()
510
def _delta(self, included, lines):
511
"""Return changes from basis to new revision.
513
The old text for comparison is the union of included revisions.
515
This is used in inserting a new text.
517
Delta is returned as a sequence of
518
(weave1, weave2, newlines).
520
This indicates that weave1:weave2 of the old weave should be
521
replaced by the sequence of lines in newlines. Note that
522
these line numbers are positions in the total weave and don't
523
correspond to the lines in any extracted version, or even the
524
extracted union of included versions.
526
If line1=line2, this is a pure insert; if newlines=[] this is a
527
pure delete. (Similar to difflib.)
529
# basis a list of (origin, lineno, line)
532
for origin, lineno, line in self._extract(included):
533
basis_lineno.append(lineno)
534
basis_lines.append(line)
536
# add a sentinal, because we can also match against the final line
537
basis_lineno.append(len(self._l))
539
# XXX: which line of the weave should we really consider
540
# matches the end of the file? the current code says it's the
541
# last line of the weave?
543
from difflib import SequenceMatcher
544
s = SequenceMatcher(None, basis_lines, lines)
546
# TODO: Perhaps return line numbers from composed weave as well?
548
for tag, i1, i2, j1, j2 in s.get_opcodes():
549
##print tag, i1, i2, j1, j2
554
# i1,i2 are given in offsets within basis_lines; we need to map them
555
# back to offsets within the entire weave
556
real_i1 = basis_lineno[i1]
557
real_i2 = basis_lineno[i2]
561
assert j2 <= len(lines)
563
yield real_i1, real_i2, lines[j1:j2]
567
def plan_merge(self, ver_a, ver_b):
568
"""Return pseudo-annotation indicating how the two versions merge.
570
This is computed between versions a and b and their common
573
Weave lines present in none of them are skipped entirely.
575
inc_a = self.inclusions_bitset([ver_a])
576
inc_b = self.inclusions_bitset([ver_b])
577
inc_c = inc_a & inc_b
579
for lineno, insert, deleteset, line in self._walk():
580
insertset = (1L << insert)
581
if deleteset & inc_c:
582
# killed in parent; can't be in either a or b
583
# not relevant to our work
584
yield 'killed-base', line
585
elif insertset & inc_c:
586
# was inserted in base
587
killed_a = bool(deleteset & inc_a)
588
killed_b = bool(deleteset & inc_b)
589
if killed_a and killed_b:
590
yield 'killed-both', line
592
yield 'killed-a', line
594
yield 'killed-b', line
596
yield 'unchanged', line
597
elif insertset & inc_a:
598
if deleteset & inc_a:
599
yield 'ghost-a', line
603
elif insertset & inc_b:
604
if deleteset & inc_b:
605
yield 'ghost-b', line
609
# not in either revision
610
yield 'irrelevant', line
612
yield 'unchanged', '' # terminator
616
def weave_merge(self, plan):
621
for state, line in plan:
622
if state == 'unchanged' or state == 'killed-both':
623
# resync and flush queued conflicts changes if any
624
if not lines_a and not lines_b:
626
elif ch_a and not ch_b:
628
for l in lines_a: yield l
629
elif ch_b and not ch_a:
630
for l in lines_b: yield l
631
elif lines_a == lines_b:
632
for l in lines_a: yield l
635
for l in lines_a: yield l
637
for l in lines_b: yield l
644
if state == 'unchanged':
647
elif state == 'killed-a':
650
elif state == 'killed-b':
653
elif state == 'new-a':
656
elif state == 'new-b':
660
assert state in ('irrelevant', 'ghost-a', 'ghost-b', 'killed-base',
670
def weave_info(filename, out):
671
"""Show some text information about the weave."""
672
from weavefile import read_weave
673
wf = file(filename, 'rb')
675
# FIXME: doesn't work on pipes
676
weave_size = wf.tell()
677
print >>out, "weave file size %d bytes" % weave_size
678
print >>out, "weave contains %d versions" % len(w._v)
681
print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')
682
for i in (6, 6, 8, 40, 20):
685
for i in range(len(w._v)):
688
bytes = sum((len(a) for a in text))
690
print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),
696
print >>out, "versions total %d bytes" % total
697
print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))
701
print """bzr weave tool
703
Experimental tool for weave algorithm.
707
Create an empty weave file
708
weave get WEAVEFILE VERSION
709
Write out specified version.
710
weave check WEAVEFILE
711
Check consistency of all versions.
713
Display table of contents.
714
weave add WEAVEFILE [BASE...] < NEWTEXT
715
Add NEWTEXT, with specified parent versions.
716
weave annotate WEAVEFILE VERSION
717
Display origin of each line.
718
weave mash WEAVEFILE VERSION...
719
Display composite of all selected versions.
720
weave merge WEAVEFILE VERSION1 VERSION2 > OUT
721
Auto-merge two versions and display conflicts.
725
% weave init foo.weave
727
% weave add foo.weave < foo.txt
730
(create updated version)
732
% weave get foo.weave 0 | diff -u - foo.txt
733
% weave add foo.weave 0 < foo.txt
736
% weave get foo.weave 0 > foo.txt (create forked version)
738
% weave add foo.weave 0 < foo.txt
741
% weave merge foo.weave 1 2 > foo.txt (merge them)
742
% vi foo.txt (resolve conflicts)
743
% weave add foo.weave 1 2 < foo.txt (commit merged version)
752
from weavefile import write_weave, read_weave
753
from bzrlib.progress import ProgressBar
761
return read_weave(file(argv[2], 'rb'))
767
# at the moment, based on everything in the file
768
parents = map(int, argv[3:])
769
lines = sys.stdin.readlines()
770
ver = w.add(parents, lines)
771
write_weave(w, file(argv[2], 'wb'))
772
print 'added version %d' % ver
775
if os.path.exists(fn):
776
raise IOError("file exists")
778
write_weave(w, file(fn, 'wb'))
779
elif cmd == 'get': # get one version
781
sys.stdout.writelines(w.get_iter(int(argv[3])))
783
elif cmd == 'mash': # get composite
785
sys.stdout.writelines(w.mash_iter(map(int, argv[3:])))
787
elif cmd == 'annotate':
789
# newline is added to all lines regardless; too hard to get
790
# reasonable formatting otherwise
792
for origin, text in w.annotate(int(argv[3])):
793
text = text.rstrip('\r\n')
795
print ' | %s' % (text)
797
print '%5d | %s' % (origin, text)
801
weave_info(argv[2], sys.stdout)
809
elif cmd == 'inclusions':
811
print ' '.join(map(str, w.inclusions([int(argv[3])])))
813
elif cmd == 'parents':
815
print ' '.join(map(str, w._v[int(argv[3])]))
817
elif cmd == 'plan-merge':
819
for state, line in w.plan_merge(int(argv[3]), int(argv[4])):
821
print '%14s | %s' % (state, line),
825
p = w.plan_merge(int(argv[3]), int(argv[4]))
826
sys.stdout.writelines(w.weave_merge(p))
828
elif cmd == 'mash-merge':
834
v1, v2 = map(int, argv[3:5])
836
basis = w.inclusions([v1]).intersection(w.inclusions([v2]))
838
base_lines = list(w.mash_iter(basis))
839
a_lines = list(w.get(v1))
840
b_lines = list(w.get(v2))
842
from bzrlib.merge3 import Merge3
843
m3 = Merge3(base_lines, a_lines, b_lines)
845
name_a = 'version %d' % v1
846
name_b = 'version %d' % v2
847
sys.stdout.writelines(m3.merge_lines(name_a=name_a, name_b=name_b))
849
raise ValueError('unknown command %r' % cmd)
852
if __name__ == '__main__':
854
sys.exit(main(sys.argv))