3
# Copyright (C) 2005 Canonical Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License
16
# along with this program; if not, write to the Free Software
17
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
# Author: Martin Pool <mbp@canonical.com>
22
"""Weave - storage of related text file versions"""
24
# TODO: Perhaps have copy method for Weave instances?
26
# XXX: If we do weaves this way, will a merge still behave the same
27
# way if it's done in a different order? That's a pretty desirable
30
# TODO: How to write these to disk? One option is cPickle, which
31
# would be fast but less friendly to C, and perhaps not portable. Another is
33
# TODO: Nothing here so far assumes the lines are really \n newlines,
34
# rather than being split up in some other way. We could accomodate
35
# binaries, perhaps by naively splitting on \n or perhaps using
36
# something like a rolling checksum.
38
# TODO: Perhaps track SHA-1 in the header for protection? This would
39
# be redundant with it being stored in the inventory, but perhaps
42
# TODO: Track version names as well as indexes.
44
# TODO: Probably do transitive expansion when specifying parents?
46
# TODO: Separate out some code to read and write weaves.
48
# TODO: End marker for each version so we can stop reading?
50
# TODO: Check that no insertion occurs inside a deletion that was
51
# active in the version of the insertion.
53
# TODO: Perhaps a special slower check() method that verifies more
54
# nesting constraints and the MD5 of each version?
62
from sets import Set, ImmutableSet
64
frozenset = ImmutableSet
68
class WeaveError(Exception):
69
"""Exception in processing weave"""
72
class WeaveFormatError(WeaveError):
73
"""Weave invariant violated"""
77
"""weave - versioned text file storage.
79
A Weave manages versions of line-based text files, keeping track
80
of the originating version for each line.
82
To clients the "lines" of the file are represented as a list of strings.
83
These strings will typically have terminal newline characters, but
84
this is not required. In particular files commonly do not have a newline
85
at the end of the file.
87
Texts can be identified in either of two ways:
89
* a nonnegative index number.
91
* a version-id string.
93
Typically the index number will be valid only inside this weave and
94
the version-id is used to reference it in the larger world.
96
The weave is represented as a list mixing edit instructions and
97
literal text. Each entry in _l can be either a string (or
98
unicode), or a tuple. If a string, it means that the given line
99
should be output in the currently active revisions.
101
If a tuple, it gives a processing instruction saying in which
102
revisions the enclosed lines are active. The tuple has the form
103
(instruction, version).
105
The instruction can be '{' or '}' for an insertion block, and '['
106
and ']' for a deletion block respectively. The version is the
107
integer version index. There is no replace operator, only deletes
112
* A later version can delete lines that were introduced by any
113
number of ancestor versions; this implies that deletion
114
instructions can span insertion blocks without regard to the
115
insertion block's nesting.
117
* Similarly, deletions need not be properly nested with regard to
118
each other, because they might have been generated by
119
independent revisions.
121
* Insertions are always made by inserting a new bracketed block
122
into a single point in the previous weave. This implies they
123
can nest but not overlap, and the nesting must always have later
124
insertions on the inside.
126
* It doesn't seem very useful to have an active insertion
127
inside an inactive insertion, but it might happen.
129
* Therefore, all instructions are always"considered"; that
130
is passed onto and off the stack. An outer inactive block
131
doesn't disable an inner block.
133
* Lines are enabled if the most recent enclosing insertion is
134
active and none of the enclosing deletions are active.
136
* There is no point having a deletion directly inside its own
137
insertion; you might as well just not write it. And there
138
should be no way to get an earlier version deleting a later
145
List of versions, indexed by index number.
147
For each version we store the set (included_versions), which
148
lists the previous versions also considered active; the
149
versions included in those versions are included transitively.
150
So new versions created from nothing list []; most versions
151
have a single entry; some have more.
154
List of hex SHA-1 of each version, or None if not recorded.
162
def __eq__(self, other):
163
if not isinstance(other, Weave):
165
return self._v == other._v \
166
and self._l == other._l
169
def __ne__(self, other):
170
return not self.__eq__(other)
173
def add(self, parents, text):
174
"""Add a single text on top of the weave.
176
Returns the index number of the newly added version.
179
List or set of parent version numbers. This must normally include
180
the parents and the parent's parents, or wierd things might happen.
183
Sequence of lines to be added in the new version."""
184
## self._check_versions(parents)
185
## self._check_lines(text)
196
delta = self._delta(self.inclusions(parents), text)
198
# offset gives the number of lines that have been inserted
199
# into the weave up to the current point; if the original edit instruction
200
# says to change line A then we actually change (A+offset)
203
for i1, i2, newlines in delta:
206
assert i2 <= len(self._l)
208
# the deletion and insertion are handled separately.
209
# first delete the region.
211
self._l.insert(i1+offset, ('[', idx))
212
self._l.insert(i2+offset+1, (']', idx))
217
# there may have been a deletion spanning up to
218
# i2; we want to insert after this region to make sure
219
# we don't destroy ourselves
221
self._l[i:i] = [('{', idx)] \
224
offset += 2 + len(newlines)
226
self._addversion(parents)
228
# special case; adding with no parents revision; can do this
229
# more quickly by just appending unconditionally
230
self._l.append(('{', idx))
232
self._l.append(('}', idx))
234
self._addversion(None)
236
self._sha1s.append(sha1)
241
def inclusions(self, versions):
242
"""Expand out everything included by versions."""
248
raise ValueError("version %d not present in weave" % v)
252
def _addversion(self, parents):
254
self._v.append(frozenset(parents))
256
self._v.append(frozenset())
259
def _check_lines(self, text):
260
if not isinstance(text, list):
261
raise ValueError("text should be a list, not %s" % type(text))
264
if not isinstance(l, basestring):
265
raise ValueError("text line should be a string or unicode, not %s"
270
def _check_versions(self, indexes):
271
"""Check everything in the sequence of indexes is valid"""
276
raise IndexError("invalid version number %r" % i)
279
def annotate(self, index):
280
return list(self.annotate_iter(index))
283
def annotate_iter(self, version):
284
"""Yield list of (index-id, line) pairs for the specified version.
286
The index indicates when the line originated in the weave."""
287
included = self.inclusions([version])
288
for origin, lineno, text in self._extract(included):
292
def _extract(self, included):
293
"""Yield annotation of lines in included set.
295
Yields a sequence of tuples (origin, lineno, text), where
296
origin is the origin version, lineno the index in the weave,
297
and text the text of the line.
299
The set typically but not necessarily corresponds to a version.
301
istack = [] # versions for which an insertion block is current
303
dset = set() # versions for which a deletion block is current
307
lineno = 0 # line of weave, 0-based
309
# TODO: Probably only need to put included revisions in the istack
311
# TODO: Could split this into two functions, one that updates
312
# the stack and the other that processes the results -- but
313
# I'm not sure it's really needed.
315
# TODO: In fact, I think we only need to store the *count* of
316
# active insertions and deletions, and we can maintain that by
317
# just by just counting as we go along.
319
WFE = WeaveFormatError
322
if isinstance(l, tuple):
323
isactive = None # recalculate
326
if istack and (istack[-1] >= v):
327
raise WFE("improperly nested insertions %d>=%d on line %d"
328
% (istack[-1], v, lineno))
334
raise WFE("unmatched close of insertion %d on line %d"
337
raise WFE("mismatched close of insertion %d!=%d on line %d"
342
raise WFE("repeated deletion marker for version %d on line %d"
346
raise WFE("version %d deletes own text on line %d"
354
raise WFE("unmatched close of deletion %d on line %d"
357
raise WFE("invalid processing instruction %r on line %d"
360
assert isinstance(l, basestring)
362
raise WFE("literal at top level on line %d"
365
isactive = (istack[-1] in included) \
366
and not included.intersection(dset)
369
yield origin, lineno, l
373
raise WFE("unclosed insertion blocks at end of weave",
376
raise WFE("unclosed deletion blocks at end of weave",
380
def get_iter(self, version):
381
"""Yield lines for the specified version."""
382
for origin, lineno, line in self._extract(self.inclusions([version])):
386
def get(self, index):
387
return list(self.get_iter(index))
390
def mash_iter(self, included):
391
"""Return composed version of multiple included versions."""
392
included = frozenset(included)
393
for origin, lineno, text in self._extract(included):
397
def dump(self, to_file):
398
from pprint import pprint
399
print >>to_file, "Weave._l = ",
400
pprint(self._l, to_file)
401
print >>to_file, "Weave._v = ",
402
pprint(self._v, to_file)
406
def numversions(self):
408
assert l == len(self._sha1s)
413
# check no circular inclusions
414
for version in range(self.numversions()):
415
inclusions = list(self._v[version])
418
if inclusions[-1] >= version:
419
raise WeaveFormatError("invalid included version %d for index %d"
420
% (inclusions[-1], version))
422
# try extracting all versions; this is a bit slow and parallel
423
# extraction could be used
425
for version in range(self.numversions()):
427
for l in self.get_iter(version):
430
expected = self._sha1s[version]
432
raise WeaveError("mismatched sha1 for version %d; "
433
"got %s, expected %s"
434
% (version, hd, expected))
438
def merge(self, merge_versions):
439
"""Automerge and mark conflicts between versions.
441
This returns a sequence, each entry describing alternatives
442
for a chunk of the file. Each of the alternatives is given as
445
If there is a chunk of the file where there's no diagreement,
446
only one alternative is given.
449
# approach: find the included versions common to all the
451
raise NotImplementedError()
455
def _delta(self, included, lines):
456
"""Return changes from basis to new revision.
458
The old text for comparison is the union of included revisions.
460
This is used in inserting a new text.
462
Delta is returned as a sequence of
463
(weave1, weave2, newlines).
465
This indicates that weave1:weave2 of the old weave should be
466
replaced by the sequence of lines in newlines. Note that
467
these line numbers are positions in the total weave and don't
468
correspond to the lines in any extracted version, or even the
469
extracted union of included versions.
471
If line1=line2, this is a pure insert; if newlines=[] this is a
472
pure delete. (Similar to difflib.)
474
# basis a list of (origin, lineno, line)
477
for origin, lineno, line in self._extract(included):
478
basis_lineno.append(lineno)
479
basis_lines.append(line)
481
# add a sentinal, because we can also match against the final line
482
basis_lineno.append(len(self._l))
484
# XXX: which line of the weave should we really consider
485
# matches the end of the file? the current code says it's the
486
# last line of the weave?
488
from difflib import SequenceMatcher
489
s = SequenceMatcher(None, basis_lines, lines)
491
# TODO: Perhaps return line numbers from composed weave as well?
493
for tag, i1, i2, j1, j2 in s.get_opcodes():
494
##print tag, i1, i2, j1, j2
499
# i1,i2 are given in offsets within basis_lines; we need to map them
500
# back to offsets within the entire weave
501
real_i1 = basis_lineno[i1]
502
real_i2 = basis_lineno[i2]
506
assert j2 <= len(lines)
508
yield real_i1, real_i2, lines[j1:j2]
512
def weave_info(filename, out):
513
"""Show some text information about the weave."""
514
from weavefile import read_weave
515
wf = file(filename, 'rb')
517
# FIXME: doesn't work on pipes
518
weave_size = wf.tell()
519
print >>out, "weave file size %d bytes" % weave_size
520
print >>out, "weave contains %d versions" % len(w._v)
523
print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')
524
for i in (6, 6, 8, 40, 20):
527
for i in range(len(w._v)):
530
bytes = sum((len(a) for a in text))
532
print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),
533
print ', '.join(map(str, w._v[i]))
536
print >>out, "versions total %d bytes" % total
537
print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))
541
print """bzr weave tool
543
Experimental tool for weave algorithm.
547
Create an empty weave file
548
weave get WEAVEFILE VERSION
549
Write out specified version.
550
weave check WEAVEFILE
551
Check consistency of all versions.
553
Display table of contents.
554
weave add WEAVEFILE [BASE...] < NEWTEXT
555
Add NEWTEXT, with specified parent versions.
556
weave annotate WEAVEFILE VERSION
557
Display origin of each line.
558
weave mash WEAVEFILE VERSION...
559
Display composite of all selected versions.
560
weave merge WEAVEFILE VERSION1 VERSION2 > OUT
561
Auto-merge two versions and display conflicts.
565
% weave init foo.weave
567
% weave add foo.weave < foo.txt
570
(create updated version)
572
% weave get foo.weave 0 | diff -u - foo.txt
573
% weave add foo.weave 0 < foo.txt
576
% weave get foo.weave 0 > foo.txt (create forked version)
578
% weave add foo.weave 0 < foo.txt
581
% weave merge foo.weave 1 2 > foo.txt (merge them)
582
% vi foo.txt (resolve conflicts)
583
% weave add foo.weave 1 2 < foo.txt (commit merged version)
592
from weavefile import write_weave, read_weave
596
return read_weave(file(argv[2], 'rb'))
602
# at the moment, based on everything in the file
603
parents = map(int, argv[3:])
604
lines = sys.stdin.readlines()
605
ver = w.add(parents, lines)
606
write_weave(w, file(argv[2], 'wb'))
607
print 'added version %d' % ver
610
if os.path.exists(fn):
611
raise IOError("file exists")
613
write_weave(w, file(fn, 'wb'))
614
elif cmd == 'get': # get one version
616
sys.stdout.writelines(w.get_iter(int(argv[3])))
618
elif cmd == 'mash': # get composite
620
sys.stdout.writelines(w.mash_iter(map(int, argv[3:])))
622
elif cmd == 'annotate':
624
# newline is added to all lines regardless; too hard to get
625
# reasonable formatting otherwise
627
for origin, text in w.annotate(int(argv[3])):
628
text = text.rstrip('\r\n')
630
print ' | %s' % (text)
632
print '%5d | %s' % (origin, text)
636
weave_info(argv[2], sys.stdout)
648
v1, v2 = map(int, argv[3:5])
650
basis = w.inclusions([v1]).intersection(w.inclusions([v2]))
652
base_lines = list(w.mash_iter(basis))
653
a_lines = list(w.get(v1))
654
b_lines = list(w.get(v2))
656
from bzrlib.merge3 import Merge3
657
m3 = Merge3(base_lines, a_lines, b_lines)
659
name_a = 'version %d' % v1
660
name_b = 'version %d' % v2
661
sys.stdout.writelines(m3.merge_lines(name_a=name_a, name_b=name_b))
663
raise ValueError('unknown command %r' % cmd)
666
if __name__ == '__main__':
668
sys.exit(main(sys.argv))