22
22
"""Weave - storage of related text file versions"""
24
# before intset (r923) 2000 versions in 41.5s
25
# with intset (r926) 2000 versions in 93s !!!
26
# better to just use plain sets.
28
# making _extract build and return a list, rather than being a generator
31
# with python -O, r923 does 2000 versions in 36.87s
33
# with optimizations to avoid mutating lists - 35.75! I guess copying
34
# all the elements every time costs more than the small manipulations.
35
# a surprisingly small change.
37
# r931, which avoids using a generator for extract, does 36.98s
39
# with memoized inclusions, takes 41.49s; not very good
41
# with slots, takes 37.35s; without takes 39.16, a bit surprising
43
# with the delta calculation mixed in with the add method, rather than
44
# separated, takes 36.78s
46
# with delta folded in and mutation of the list, 36.13s
48
# with all this and simplification of add code, 33s
51
24
# TODO: Perhaps have copy method for Weave instances?
53
26
# XXX: If we do weaves this way, will a merge still behave the same
54
27
# way if it's done in a different order? That's a pretty desirable
30
# TODO: How to write these to disk? One option is cPickle, which
31
# would be fast but less friendly to C, and perhaps not portable. Another is
57
33
# TODO: Nothing here so far assumes the lines are really \n newlines,
58
34
# rather than being split up in some other way. We could accomodate
59
35
# binaries, perhaps by naively splitting on \n or perhaps using
60
36
# something like a rolling checksum.
38
# TODO: Perhaps track SHA-1 in the header for protection? This would
39
# be redundant with it being stored in the inventory, but perhaps
62
42
# TODO: Track version names as well as indexes.
44
# TODO: Probably do transitive expansion when specifying parents?
46
# TODO: Separate out some code to read and write weaves.
64
48
# TODO: End marker for each version so we can stop reading?
66
50
# TODO: Check that no insertion occurs inside a deletion that was
67
51
# active in the version of the insertion.
69
# TODO: In addition to the SHA-1 check, perhaps have some code that
70
# checks structural constraints of the weave: ie that insertions are
71
# properly nested, that there is no text outside of an insertion, that
72
# insertions or deletions are not repeated, etc.
74
# TODO: Parallel-extract that passes back each line along with a
75
# description of which revisions include it. Nice for checking all
53
# TODO: Perhaps a special slower check() method that verifies more
54
# nesting constraints and the MD5 of each version?
62
from sets import Set, ImmutableSet
64
frozenset = ImmutableSet
81
68
class WeaveError(Exception):
194
178
Sequence of lines to be added in the new version."""
196
self._check_versions(parents)
179
## self._check_versions(parents)
197
180
## self._check_lines(text)
198
new_version = len(self._parents)
203
187
sha1 = s.hexdigest()
206
# if we abort after here the weave will be corrupt
207
self._parents.append(frozenset(parents))
191
ancestors = self.inclusions(parents)
192
delta = self._delta(ancestors, text)
194
# offset gives the number of lines that have been inserted
195
# into the weave up to the current point; if the original edit instruction
196
# says to change line A then we actually change (A+offset)
199
for i1, i2, newlines in delta:
202
assert i2 <= len(self._l)
204
# the deletion and insertion are handled separately.
205
# first delete the region.
207
self._l.insert(i1+offset, ('[', idx))
208
self._l.insert(i2+offset+1, (']', idx))
213
# there may have been a deletion spanning up to
214
# i2; we want to insert after this region to make sure
215
# we don't destroy ourselves
217
self._l[i:i] = [('{', idx)] \
220
offset += 2 + len(newlines)
222
self._addversion(parents)
224
# special case; adding with no parents revision; can do this
225
# more quickly by just appending unconditionally
226
self._l.append(('{', idx))
228
self._l.append(('}', idx))
230
self._addversion(None)
208
232
self._sha1s.append(sha1)
212
# special case; adding with no parents revision; can do
213
# this more quickly by just appending unconditionally.
214
# even more specially, if we're adding an empty text we
215
# need do nothing at all.
217
self._weave.append(('{', new_version))
218
self._weave.extend(text)
219
self._weave.append(('}', new_version))
223
if len(parents) == 1:
224
pv = list(parents)[0]
225
if sha1 == self._sha1s[pv]:
226
# special case: same as the single parent
230
ancestors = self.inclusions(parents)
234
# basis a list of (origin, lineno, line)
237
for origin, lineno, line in self._extract(ancestors):
238
basis_lineno.append(lineno)
239
basis_lines.append(line)
241
# another small special case: a merge, producing the same text
243
if text == basis_lines:
246
# add a sentinal, because we can also match against the final line
247
basis_lineno.append(len(self._weave))
249
# XXX: which line of the weave should we really consider
250
# matches the end of the file? the current code says it's the
251
# last line of the weave?
253
#print 'basis_lines:', basis_lines
254
#print 'new_lines: ', lines
256
from difflib import SequenceMatcher
257
s = SequenceMatcher(None, basis_lines, text)
259
# offset gives the number of lines that have been inserted
260
# into the weave up to the current point; if the original edit instruction
261
# says to change line A then we actually change (A+offset)
264
for tag, i1, i2, j1, j2 in s.get_opcodes():
265
# i1,i2 are given in offsets within basis_lines; we need to map them
266
# back to offsets within the entire weave
267
#print 'raw match', tag, i1, i2, j1, j2
271
i1 = basis_lineno[i1]
272
i2 = basis_lineno[i2]
274
assert 0 <= j1 <= j2 <= len(text)
276
#print tag, i1, i2, j1, j2
278
# the deletion and insertion are handled separately.
279
# first delete the region.
281
self._weave.insert(i1+offset, ('[', new_version))
282
self._weave.insert(i2+offset+1, (']', new_version))
286
# there may have been a deletion spanning up to
287
# i2; we want to insert after this region to make sure
288
# we don't destroy ourselves
290
self._weave[i:i] = ([('{', new_version)]
292
+ [('}', new_version)])
293
offset += 2 + (j2 - j1)
298
237
def inclusions(self, versions):
475
378
def mash_iter(self, included):
476
379
"""Return composed version of multiple included versions."""
380
included = frozenset(included)
477
381
for origin, lineno, text in self._extract(included):
481
385
def dump(self, to_file):
482
386
from pprint import pprint
483
print >>to_file, "Weave._weave = ",
484
pprint(self._weave, to_file)
485
print >>to_file, "Weave._parents = ",
486
pprint(self._parents, to_file)
387
print >>to_file, "Weave._l = ",
388
pprint(self._l, to_file)
389
print >>to_file, "Weave._v = ",
390
pprint(self._v, to_file)
490
394
def numversions(self):
491
l = len(self._parents)
492
396
assert l == len(self._sha1s)
497
return self.numversions()
500
400
def check(self, progress_bar=None):
501
401
# check no circular inclusions
502
402
for version in range(self.numversions()):
503
inclusions = list(self._parents[version])
403
inclusions = list(self._v[version])
505
405
inclusions.sort()
506
406
if inclusions[-1] >= version:
566
466
If line1=line2, this is a pure insert; if newlines=[] this is a
567
467
pure delete. (Similar to difflib.)
572
def plan_merge(self, ver_a, ver_b):
573
"""Return pseudo-annotation indicating how the two versions merge.
575
This is computed between versions a and b and their common
578
Weave lines present in none of them are skipped entirely.
580
inc_a = self.inclusions([ver_a])
581
inc_b = self.inclusions([ver_b])
582
inc_c = inc_a & inc_b
584
for lineno, insert, deleteset, line in self._walk():
585
if deleteset & inc_c:
586
# killed in parent; can't be in either a or b
587
# not relevant to our work
588
yield 'killed-base', line
589
elif insert in inc_c:
590
# was inserted in base
591
killed_a = bool(deleteset & inc_a)
592
killed_b = bool(deleteset & inc_b)
593
if killed_a and killed_b:
594
yield 'killed-both', line
596
yield 'killed-a', line
598
yield 'killed-b', line
600
yield 'unchanged', line
601
elif insert in inc_a:
602
if deleteset & inc_a:
603
yield 'ghost-a', line
607
elif insert in inc_b:
608
if deleteset & inc_b:
609
yield 'ghost-b', line
613
# not in either revision
614
yield 'irrelevant', line
616
yield 'unchanged', '' # terminator
620
def weave_merge(self, plan):
625
for state, line in plan:
626
if state == 'unchanged' or state == 'killed-both':
627
# resync and flush queued conflicts changes if any
628
if not lines_a and not lines_b:
630
elif ch_a and not ch_b:
632
for l in lines_a: yield l
633
elif ch_b and not ch_a:
634
for l in lines_b: yield l
635
elif lines_a == lines_b:
636
for l in lines_a: yield l
639
for l in lines_a: yield l
641
for l in lines_b: yield l
648
if state == 'unchanged':
651
elif state == 'killed-a':
654
elif state == 'killed-b':
657
elif state == 'new-a':
660
elif state == 'new-b':
664
assert state in ('irrelevant', 'ghost-a', 'ghost-b', 'killed-base',
469
# basis a list of (origin, lineno, line)
472
for origin, lineno, line in self._extract(included):
473
basis_lineno.append(lineno)
474
basis_lines.append(line)
476
# add a sentinal, because we can also match against the final line
477
basis_lineno.append(len(self._l))
479
# XXX: which line of the weave should we really consider
480
# matches the end of the file? the current code says it's the
481
# last line of the weave?
483
from difflib import SequenceMatcher
484
s = SequenceMatcher(None, basis_lines, lines)
486
# TODO: Perhaps return line numbers from composed weave as well?
488
for tag, i1, i2, j1, j2 in s.get_opcodes():
489
##print tag, i1, i2, j1, j2
494
# i1,i2 are given in offsets within basis_lines; we need to map them
495
# back to offsets within the entire weave
496
real_i1 = basis_lineno[i1]
497
real_i2 = basis_lineno[i2]
501
assert j2 <= len(lines)
503
yield real_i1, real_i2, lines[j1:j2]
507
def weave_info(filename, out):
675
508
"""Show some text information about the weave."""
676
print '%6s %40s %20s' % ('ver', 'sha1', 'parents')
677
for i in (6, 40, 20):
680
for i in range(w.numversions()):
682
print '%6d %40s %s' % (i, sha1, ' '.join(map(str, w._parents[i])))
686
def weave_stats(weave_file):
687
from bzrlib.progress import ProgressBar
688
from bzrlib.weavefile import read_weave
692
wf = file(weave_file, 'rb')
509
from weavefile import read_weave
510
wf = file(filename, 'rb')
693
511
w = read_weave(wf)
694
512
# FIXME: doesn't work on pipes
695
513
weave_size = wf.tell()
514
print >>out, "weave file size %d bytes" % weave_size
515
print >>out, "weave contains %d versions" % len(w._v)
699
for i in range(vers):
700
pb.update('checking sizes', i, vers)
701
for line in w.get_iter(i):
706
print 'versions %9d' % vers
707
print 'weave file %9d bytes' % weave_size
708
print 'total contents %9d bytes' % total
709
print 'compression ratio %9.2fx' % (float(total) / float(weave_size))
712
print 'average size %9d bytes' % avg
713
print 'relative size %9.2fx' % (float(weave_size) / float(avg))
518
print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')
519
for i in (6, 6, 8, 40, 20):
522
for i in range(len(w._v)):
525
bytes = sum((len(a) for a in text))
527
print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),
533
print >>out, "versions total %d bytes" % total
534
print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))