22
22
"""Weave - storage of related text file versions"""
24
# before intset (r923) 2000 versions in 41.5s
25
# with intset (r926) 2000 versions in 93s !!!
26
# better to just use plain sets.
28
# making _extract build and return a list, rather than being a generator
31
# with python -O, r923 does 2000 versions in 36.87s
33
# with optimizations to avoid mutating lists - 35.75! I guess copying
34
# all the elements every time costs more than the small manipulations.
35
# a surprisingly small change.
37
# r931, which avoids using a generator for extract, does 36.98s
39
# with memoized inclusions, takes 41.49s; not very good
41
# with slots, takes 37.35s; without takes 39.16, a bit surprising
43
# with the delta calculation mixed in with the add method, rather than
44
# separated, takes 36.78s
46
# with delta folded in and mutation of the list, 36.13s
48
# with all this and simplification of add code, 33s
51
24
# TODO: Perhaps have copy method for Weave instances?
53
26
# XXX: If we do weaves this way, will a merge still behave the same
194
180
Sequence of lines to be added in the new version."""
196
self._check_versions(parents)
181
## self._check_versions(parents)
197
182
## self._check_lines(text)
198
new_version = len(self._parents)
203
189
sha1 = s.hexdigest()
206
# if we abort after here the weave will be corrupt
207
self._parents.append(frozenset(parents))
192
# TODO: It'd probably be faster to append things on to a new
193
# list rather than modifying the existing one, which is likely
194
# to cause a lot of copying.
197
ancestors = self.inclusions(parents)
198
delta = self._delta(ancestors, text)
200
# offset gives the number of lines that have been inserted
201
# into the weave up to the current point; if the original edit instruction
202
# says to change line A then we actually change (A+offset)
205
for i1, i2, newlines in delta:
208
assert i2 <= len(self._l)
210
# the deletion and insertion are handled separately.
211
# first delete the region.
213
self._l.insert(i1+offset, ('[', idx))
214
self._l.insert(i2+offset+1, (']', idx))
219
# there may have been a deletion spanning up to
220
# i2; we want to insert after this region to make sure
221
# we don't destroy ourselves
223
self._l[i:i] = [('{', idx)] \
226
offset += 2 + len(newlines)
228
self._addversion(parents)
230
# special case; adding with no parents revision; can do this
231
# more quickly by just appending unconditionally
232
self._l.append(('{', idx))
234
self._l.append(('}', idx))
236
self._addversion(None)
208
238
self._sha1s.append(sha1)
212
# special case; adding with no parents revision; can do
213
# this more quickly by just appending unconditionally.
214
# even more specially, if we're adding an empty text we
215
# need do nothing at all.
217
self._weave.append(('{', new_version))
218
self._weave.extend(text)
219
self._weave.append(('}', new_version))
223
if len(parents) == 1:
224
pv = list(parents)[0]
225
if sha1 == self._sha1s[pv]:
226
# special case: same as the single parent
230
ancestors = self.inclusions(parents)
234
# basis a list of (origin, lineno, line)
237
for origin, lineno, line in self._extract(ancestors):
238
basis_lineno.append(lineno)
239
basis_lines.append(line)
241
# another small special case: a merge, producing the same text as auto-merge
242
if text == basis_lines:
245
# add a sentinal, because we can also match against the final line
246
basis_lineno.append(len(self._weave))
248
# XXX: which line of the weave should we really consider
249
# matches the end of the file? the current code says it's the
250
# last line of the weave?
252
#print 'basis_lines:', basis_lines
253
#print 'new_lines: ', lines
255
from difflib import SequenceMatcher
256
s = SequenceMatcher(None, basis_lines, text)
258
# offset gives the number of lines that have been inserted
259
# into the weave up to the current point; if the original edit instruction
260
# says to change line A then we actually change (A+offset)
263
for tag, i1, i2, j1, j2 in s.get_opcodes():
264
# i1,i2 are given in offsets within basis_lines; we need to map them
265
# back to offsets within the entire weave
266
#print 'raw match', tag, i1, i2, j1, j2
270
i1 = basis_lineno[i1]
271
i2 = basis_lineno[i2]
273
assert 0 <= j1 <= j2 <= len(text)
275
#print tag, i1, i2, j1, j2
277
# the deletion and insertion are handled separately.
278
# first delete the region.
280
self._weave.insert(i1+offset, ('[', new_version))
281
self._weave.insert(i2+offset+1, (']', new_version))
285
# there may have been a deletion spanning up to
286
# i2; we want to insert after this region to make sure
287
# we don't destroy ourselves
289
self._weave[i:i] = ([('{', new_version)]
291
+ [('}', new_version)])
292
offset += 2 + (j2 - j1)
243
def inclusions_bitset(self, versions):
250
# if v is included, include all its parents
251
for pv in self._v[v]:
297
257
def inclusions(self, versions):
474
438
def mash_iter(self, included):
475
439
"""Return composed version of multiple included versions."""
440
included = frozenset(included)
476
441
for origin, lineno, text in self._extract(included):
480
445
def dump(self, to_file):
481
446
from pprint import pprint
482
print >>to_file, "Weave._weave = ",
483
pprint(self._weave, to_file)
484
print >>to_file, "Weave._parents = ",
485
pprint(self._parents, to_file)
447
print >>to_file, "Weave._l = ",
448
pprint(self._l, to_file)
449
print >>to_file, "Weave._v = ",
450
pprint(self._v, to_file)
489
454
def numversions(self):
490
l = len(self._parents)
491
456
assert l == len(self._sha1s)
496
return self.numversions()
499
460
def check(self, progress_bar=None):
500
461
# check no circular inclusions
501
462
for version in range(self.numversions()):
502
inclusions = list(self._parents[version])
463
inclusions = list(self._v[version])
504
465
inclusions.sort()
505
466
if inclusions[-1] >= version:
565
526
If line1=line2, this is a pure insert; if newlines=[] this is a
566
527
pure delete. (Similar to difflib.)
529
# basis a list of (origin, lineno, line)
532
for origin, lineno, line in self._extract(included):
533
basis_lineno.append(lineno)
534
basis_lines.append(line)
536
# add a sentinal, because we can also match against the final line
537
basis_lineno.append(len(self._l))
539
# XXX: which line of the weave should we really consider
540
# matches the end of the file? the current code says it's the
541
# last line of the weave?
543
from difflib import SequenceMatcher
544
s = SequenceMatcher(None, basis_lines, lines)
546
# TODO: Perhaps return line numbers from composed weave as well?
548
for tag, i1, i2, j1, j2 in s.get_opcodes():
549
##print tag, i1, i2, j1, j2
554
# i1,i2 are given in offsets within basis_lines; we need to map them
555
# back to offsets within the entire weave
556
real_i1 = basis_lineno[i1]
557
real_i2 = basis_lineno[i2]
561
assert j2 <= len(lines)
563
yield real_i1, real_i2, lines[j1:j2]
670
def weave_info(filename, out):
674
671
"""Show some text information about the weave."""
675
print '%6s %40s %20s' % ('ver', 'sha1', 'parents')
676
for i in (6, 40, 20):
679
for i in range(w.numversions()):
681
print '%6d %40s %s' % (i, sha1, ' '.join(map(str, w._parents[i])))
685
def weave_stats(weave_file):
686
from bzrlib.progress import ProgressBar
687
from bzrlib.weavefile import read_weave
691
wf = file(weave_file, 'rb')
672
from weavefile import read_weave
673
wf = file(filename, 'rb')
692
674
w = read_weave(wf)
693
675
# FIXME: doesn't work on pipes
694
676
weave_size = wf.tell()
677
print >>out, "weave file size %d bytes" % weave_size
678
print >>out, "weave contains %d versions" % len(w._v)
698
for i in range(vers):
699
pb.update('checking sizes', i, vers)
700
for line in w.get_iter(i):
705
print 'versions %9d' % vers
706
print 'weave file %9d bytes' % weave_size
707
print 'total contents %9d bytes' % total
708
print 'compression ratio %9.2fx' % (float(total) / float(weave_size))
681
print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')
682
for i in (6, 6, 8, 40, 20):
685
for i in range(len(w._v)):
688
bytes = sum((len(a) for a in text))
690
print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),
696
print >>out, "versions total %d bytes" % total
697
print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))