3
# Copyright (C) 2005 Canonical Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License
16
# along with this program; if not, write to the Free Software
17
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
# Author: Martin Pool <mbp@canonical.com>
22
"""Weave - storage of related text file versions"""
24
# TODO: Perhaps have copy method for Weave instances?
26
# XXX: If we do weaves this way, will a merge still behave the same
27
# way if it's done in a different order? That's a pretty desirable
30
# TODO: Nothing here so far assumes the lines are really \n newlines,
31
# rather than being split up in some other way. We could accomodate
32
# binaries, perhaps by naively splitting on \n or perhaps using
33
# something like a rolling checksum.
35
# TODO: Track version names as well as indexes.
37
# TODO: End marker for each version so we can stop reading?
39
# TODO: Check that no insertion occurs inside a deletion that was
40
# active in the version of the insertion.
42
# TODO: In addition to the SHA-1 check, perhaps have some code that
43
# checks structural constraints of the weave: ie that insertions are
44
# properly nested, that there is no text outside of an insertion, that
45
# insertions or deletions are not repeated, etc.
53
from sets import Set, ImmutableSet
55
frozenset = ImmutableSet
59
class WeaveError(Exception):
60
"""Exception in processing weave"""
63
class WeaveFormatError(WeaveError):
64
"""Weave invariant violated"""
68
"""weave - versioned text file storage.
70
A Weave manages versions of line-based text files, keeping track
71
of the originating version for each line.
73
To clients the "lines" of the file are represented as a list of strings.
74
These strings will typically have terminal newline characters, but
75
this is not required. In particular files commonly do not have a newline
76
at the end of the file.
78
Texts can be identified in either of two ways:
80
* a nonnegative index number.
82
* a version-id string.
84
Typically the index number will be valid only inside this weave and
85
the version-id is used to reference it in the larger world.
87
The weave is represented as a list mixing edit instructions and
88
literal text. Each entry in _l can be either a string (or
89
unicode), or a tuple. If a string, it means that the given line
90
should be output in the currently active revisions.
92
If a tuple, it gives a processing instruction saying in which
93
revisions the enclosed lines are active. The tuple has the form
94
(instruction, version).
96
The instruction can be '{' or '}' for an insertion block, and '['
97
and ']' for a deletion block respectively. The version is the
98
integer version index. There is no replace operator, only deletes
103
* A later version can delete lines that were introduced by any
104
number of ancestor versions; this implies that deletion
105
instructions can span insertion blocks without regard to the
106
insertion block's nesting.
108
* Similarly, deletions need not be properly nested with regard to
109
each other, because they might have been generated by
110
independent revisions.
112
* Insertions are always made by inserting a new bracketed block
113
into a single point in the previous weave. This implies they
114
can nest but not overlap, and the nesting must always have later
115
insertions on the inside.
117
* It doesn't seem very useful to have an active insertion
118
inside an inactive insertion, but it might happen.
120
* Therefore, all instructions are always"considered"; that
121
is passed onto and off the stack. An outer inactive block
122
doesn't disable an inner block.
124
* Lines are enabled if the most recent enclosing insertion is
125
active and none of the enclosing deletions are active.
127
* There is no point having a deletion directly inside its own
128
insertion; you might as well just not write it. And there
129
should be no way to get an earlier version deleting a later
136
List of parents, indexed by version number.
137
It is only necessary to store the minimal set of parents for
138
each version; the parent's parents are implied.
141
List of hex SHA-1 of each version, or None if not recorded.
149
def __eq__(self, other):
150
if not isinstance(other, Weave):
152
return self._v == other._v \
153
and self._l == other._l
156
def __ne__(self, other):
157
return not self.__eq__(other)
160
def add(self, parents, text):
161
"""Add a single text on top of the weave.
163
Returns the index number of the newly added version.
166
List or set of direct parent version numbers.
169
Sequence of lines to be added in the new version."""
170
## self._check_versions(parents)
171
## self._check_lines(text)
182
ancestors = self.inclusions(parents)
183
delta = self._delta(ancestors, text)
185
# offset gives the number of lines that have been inserted
186
# into the weave up to the current point; if the original edit instruction
187
# says to change line A then we actually change (A+offset)
190
for i1, i2, newlines in delta:
193
assert i2 <= len(self._l)
195
# the deletion and insertion are handled separately.
196
# first delete the region.
198
self._l.insert(i1+offset, ('[', idx))
199
self._l.insert(i2+offset+1, (']', idx))
204
# there may have been a deletion spanning up to
205
# i2; we want to insert after this region to make sure
206
# we don't destroy ourselves
208
self._l[i:i] = [('{', idx)] \
211
offset += 2 + len(newlines)
213
self._addversion(parents)
215
# special case; adding with no parents revision; can do this
216
# more quickly by just appending unconditionally
217
self._l.append(('{', idx))
219
self._l.append(('}', idx))
221
self._addversion(None)
223
self._sha1s.append(sha1)
228
def inclusions(self, versions):
229
"""Return set of all ancestors of given version(s)."""
235
# include all its parents
240
raise ValueError("version %d not present in weave" % v)
243
def minimal_parents(self, version):
244
"""Find the minimal set of parents for the version."""
245
included = self._v[version]
250
li.sort(reverse=True)
258
gotit.update(self.inclusions(pv))
260
assert mininc[0] >= 0
261
assert mininc[-1] < version
265
def _addversion(self, parents):
267
self._v.append(parents)
269
self._v.append(frozenset())
272
def _check_lines(self, text):
273
if not isinstance(text, list):
274
raise ValueError("text should be a list, not %s" % type(text))
277
if not isinstance(l, basestring):
278
raise ValueError("text line should be a string or unicode, not %s"
283
def _check_versions(self, indexes):
284
"""Check everything in the sequence of indexes is valid"""
289
raise IndexError("invalid version number %r" % i)
292
def annotate(self, index):
293
return list(self.annotate_iter(index))
296
def annotate_iter(self, version):
297
"""Yield list of (index-id, line) pairs for the specified version.
299
The index indicates when the line originated in the weave."""
300
for origin, lineno, text in self._extract([version]):
304
def _extract(self, versions):
305
"""Yield annotation of lines in included set.
307
Yields a sequence of tuples (origin, lineno, text), where
308
origin is the origin version, lineno the index in the weave,
309
and text the text of the line.
311
The set typically but not necessarily corresponds to a version.
313
included = self.inclusions(versions)
318
lineno = 0 # line of weave, 0-based
322
WFE = WeaveFormatError
325
if isinstance(l, tuple):
329
assert v not in istack
344
assert isinstance(l, basestring)
346
isactive = (not dset) and istack and (istack[-1] in included)
348
yield istack[-1], lineno, l
352
raise WFE("unclosed insertion blocks at end of weave",
355
raise WFE("unclosed deletion blocks at end of weave",
359
def get_iter(self, version):
360
"""Yield lines for the specified version."""
361
for origin, lineno, line in self._extract([version]):
365
def get(self, index):
366
return list(self.get_iter(index))
369
def mash_iter(self, included):
370
"""Return composed version of multiple included versions."""
371
included = frozenset(included)
372
for origin, lineno, text in self._extract(included):
376
def dump(self, to_file):
377
from pprint import pprint
378
print >>to_file, "Weave._l = ",
379
pprint(self._l, to_file)
380
print >>to_file, "Weave._v = ",
381
pprint(self._v, to_file)
385
def numversions(self):
387
assert l == len(self._sha1s)
391
def check(self, progress_bar=None):
392
# check no circular inclusions
393
for version in range(self.numversions()):
394
inclusions = list(self._v[version])
397
if inclusions[-1] >= version:
398
raise WeaveFormatError("invalid included version %d for index %d"
399
% (inclusions[-1], version))
401
# try extracting all versions; this is a bit slow and parallel
402
# extraction could be used
404
nv = self.numversions()
405
for version in range(nv):
407
progress_bar.update('checking text', version, nv)
409
for l in self.get_iter(version):
412
expected = self._sha1s[version]
414
raise WeaveError("mismatched sha1 for version %d; "
415
"got %s, expected %s"
416
% (version, hd, expected))
418
# TODO: check insertions are properly nested, that there are
419
# no lines outside of insertion blocks, that deletions are
420
# properly paired, etc.
424
def merge(self, merge_versions):
425
"""Automerge and mark conflicts between versions.
427
This returns a sequence, each entry describing alternatives
428
for a chunk of the file. Each of the alternatives is given as
431
If there is a chunk of the file where there's no diagreement,
432
only one alternative is given.
435
# approach: find the included versions common to all the
437
raise NotImplementedError()
441
def _delta(self, included, lines):
442
"""Return changes from basis to new revision.
444
The old text for comparison is the union of included revisions.
446
This is used in inserting a new text.
448
Delta is returned as a sequence of
449
(weave1, weave2, newlines).
451
This indicates that weave1:weave2 of the old weave should be
452
replaced by the sequence of lines in newlines. Note that
453
these line numbers are positions in the total weave and don't
454
correspond to the lines in any extracted version, or even the
455
extracted union of included versions.
457
If line1=line2, this is a pure insert; if newlines=[] this is a
458
pure delete. (Similar to difflib.)
460
# basis a list of (origin, lineno, line)
463
for origin, lineno, line in self._extract(included):
464
basis_lineno.append(lineno)
465
basis_lines.append(line)
467
# add a sentinal, because we can also match against the final line
468
basis_lineno.append(len(self._l))
470
# XXX: which line of the weave should we really consider
471
# matches the end of the file? the current code says it's the
472
# last line of the weave?
474
from difflib import SequenceMatcher
475
s = SequenceMatcher(None, basis_lines, lines)
477
# TODO: Perhaps return line numbers from composed weave as well?
479
for tag, i1, i2, j1, j2 in s.get_opcodes():
480
##print tag, i1, i2, j1, j2
485
# i1,i2 are given in offsets within basis_lines; we need to map them
486
# back to offsets within the entire weave
487
real_i1 = basis_lineno[i1]
488
real_i2 = basis_lineno[i2]
492
assert j2 <= len(lines)
494
yield real_i1, real_i2, lines[j1:j2]
498
def weave_info(filename, out):
499
"""Show some text information about the weave."""
500
from weavefile import read_weave
501
wf = file(filename, 'rb')
503
# FIXME: doesn't work on pipes
504
weave_size = wf.tell()
505
print >>out, "weave file size %d bytes" % weave_size
506
print >>out, "weave contains %d versions" % len(w._v)
509
print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')
510
for i in (6, 6, 8, 40, 20):
513
for i in range(len(w._v)):
516
bytes = sum((len(a) for a in text))
518
print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),
524
print >>out, "versions total %d bytes" % total
525
print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))
529
print """bzr weave tool
531
Experimental tool for weave algorithm.
535
Create an empty weave file
536
weave get WEAVEFILE VERSION
537
Write out specified version.
538
weave check WEAVEFILE
539
Check consistency of all versions.
541
Display table of contents.
542
weave add WEAVEFILE [BASE...] < NEWTEXT
543
Add NEWTEXT, with specified parent versions.
544
weave annotate WEAVEFILE VERSION
545
Display origin of each line.
546
weave mash WEAVEFILE VERSION...
547
Display composite of all selected versions.
548
weave merge WEAVEFILE VERSION1 VERSION2 > OUT
549
Auto-merge two versions and display conflicts.
553
% weave init foo.weave
555
% weave add foo.weave < foo.txt
558
(create updated version)
560
% weave get foo.weave 0 | diff -u - foo.txt
561
% weave add foo.weave 0 < foo.txt
564
% weave get foo.weave 0 > foo.txt (create forked version)
566
% weave add foo.weave 0 < foo.txt
569
% weave merge foo.weave 1 2 > foo.txt (merge them)
570
% vi foo.txt (resolve conflicts)
571
% weave add foo.weave 1 2 < foo.txt (commit merged version)
580
from weavefile import write_weave, read_weave
581
from bzrlib.progress import ProgressBar
589
return read_weave(file(argv[2], 'rb'))
595
# at the moment, based on everything in the file
596
parents = map(int, argv[3:])
597
lines = sys.stdin.readlines()
598
ver = w.add(parents, lines)
599
write_weave(w, file(argv[2], 'wb'))
600
print 'added version %d' % ver
603
if os.path.exists(fn):
604
raise IOError("file exists")
606
write_weave(w, file(fn, 'wb'))
607
elif cmd == 'get': # get one version
609
sys.stdout.writelines(w.get_iter(int(argv[3])))
611
elif cmd == 'mash': # get composite
613
sys.stdout.writelines(w.mash_iter(map(int, argv[3:])))
615
elif cmd == 'annotate':
617
# newline is added to all lines regardless; too hard to get
618
# reasonable formatting otherwise
620
for origin, text in w.annotate(int(argv[3])):
621
text = text.rstrip('\r\n')
623
print ' | %s' % (text)
625
print '%5d | %s' % (origin, text)
629
weave_info(argv[2], sys.stdout)
637
elif cmd == 'inclusions':
639
print ' '.join(map(str, w.inclusions([int(argv[3])])))
641
elif cmd == 'parents':
643
print ' '.join(map(str, w._v[int(argv[3])]))
651
v1, v2 = map(int, argv[3:5])
653
basis = w.inclusions([v1]).intersection(w.inclusions([v2]))
655
base_lines = list(w.mash_iter(basis))
656
a_lines = list(w.get(v1))
657
b_lines = list(w.get(v2))
659
from bzrlib.merge3 import Merge3
660
m3 = Merge3(base_lines, a_lines, b_lines)
662
name_a = 'version %d' % v1
663
name_b = 'version %d' % v2
664
sys.stdout.writelines(m3.merge_lines(name_a=name_a, name_b=name_b))
666
raise ValueError('unknown command %r' % cmd)
669
if __name__ == '__main__':
671
sys.exit(main(sys.argv))