        yield s_pos + (target_len - t_pos), target_len, 0


class _KnitFactory(object):
    """Base factory for creating content objects."""

    def make(self, lines, version_id):
        num_lines = len(lines)
        return KnitContent(zip([version_id] * num_lines, lines))


class AnnotatedKnitContent(KnitContent):
    """Annotated content."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        return iter(self._lines)

    def strip_last_line_newline(self):
        line = self._lines[-1][1].rstrip('\n')
        self._lines[-1] = (self._lines[-1][0], line)

    def text(self):
        return [text for origin, text in self._lines]

    def copy(self):
        return AnnotatedKnitContent(self._lines[:])


class PlainKnitContent(KnitContent):
    """Unannotated content.

    When annotate[_iter] is called on this content, the same version is reported
    for all lines. Generally, annotate[_iter] is not useful on PlainKnitContent
    objects.
    """

    def __init__(self, lines, version_id):
        self._lines = lines
        self._version_id = version_id

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        for line in self._lines:
            yield self._version_id, line

    def copy(self):
        return PlainKnitContent(self._lines[:], self._version_id)

    def strip_last_line_newline(self):
        self._lines[-1] = self._lines[-1].rstrip('\n')
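

# Editorial sketch, not part of the original module: a minimal illustration of
# the data shapes the two content classes above carry, assuming the classes
# exactly as defined.
def _content_classes_example():
    annotated = AnnotatedKnitContent([('rev-1', 'a\n'), ('rev-2', 'b\n')])
    plain = PlainKnitContent(['a\n', 'b\n'], 'rev-2')
    # Annotated content keeps (origin, text) tuples per line; plain content
    # keeps bare lines and reports its own version id for every line.
    return list(annotated.annotate_iter()), list(plain.annotate_iter())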


class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def make(self, lines, version_id):
        num_lines = len(lines)
        return AnnotatedKnitContent(zip([version_id] * num_lines, lines))

    def parse_fulltext(self, content, version_id):
        """Convert fulltext to internal representation."""

    def parse_line_delta(self, lines, version_id, plain=False):
        """Convert a line based delta into internal representation."""
        result = []
        lines = iter(lines)
        next = lines.next

        cache = {}
        def cache_and_return(line):
            origin, text = line.split(' ', 1)
            return cache.setdefault(origin, origin), text

        # walk through the lines parsing.
        # Note that the plain test is explicitly pulled out of the
        # loop to minimise any performance impact
        if plain:
            for header in lines:
                start, end, count = [int(n) for n in header.split(',')]
                contents = [next().split(' ', 1)[1] for i in xrange(count)]
                result.append((start, end, count, contents))
        else:
            for header in lines:
                start, end, count = [int(n) for n in header.split(',')]
                contents = [tuple(next().split(' ', 1)) for i in xrange(count)]
                result.append((start, end, count, contents))
        return result
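
    # Editorial note, not part of the original module: the serialised
    # annotated line-delta parsed above is a sequence of hunks, each a
    # "start,end,count" header line followed by `count` content lines of the
    # form "origin text".  For example the hunk
    #
    #     1,2,2
    #     rev-2 replacement line one
    #     rev-2 replacement line two
    #
    # parses to (1, 2, 2, [('rev-2', 'replacement line one\n'),
    # ('rev-2', 'replacement line two\n')]), i.e. "replace line 1 of the
    # basis with these two lines".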

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return (line.split(' ', 1)[1] for line in lines)

        return fulltext_size > delta_size
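
    # Editorial note, not part of the original module: the bare
    # `fulltext_size > delta_size` comparison above is the tail of the knit's
    # should-we-store-a-delta heuristic: one more delta is only worth storing
    # while the accumulated compressed size of the existing delta chain stays
    # below the compressed size of a fresh fulltext.  With hypothetical
    # figures:
    #
    #     fulltext_size = 4000
    #     delta_size = 900        # sum of the sizes already in the chain
    #     fulltext_size > delta_size  =>  True, so store another delta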

    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, [])  # should we check the lines ?
        self._check_versions_present(parents)
        present_parents = []
        ghosts = []
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta_parent is None:
            # reconstitute as full text.
            assert len(delta) == 1 or len(delta) == 0
            if len(delta):
                assert delta[0][0] == 0
                assert delta[0][1] == 0, delta[0][1]
            return super(KnitVersionedFile, self)._add_delta(version_id,
                parents, delta_parent, sha1, noeol, delta)

        digest = sha1

        options = []
        if noeol:
            options.append('no-eol')

        if delta_parent is not None:
            # determine the current delta chain length.
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas. This should minimize both
            # I/O and the time spent applying deltas.
            # The window was changed to a maximum of 200 deltas, and a check
            # was added that the total compressed size of the deltas is
            # smaller than the compressed size of the fulltext.
            if not self._check_should_delta([delta_parent]):
                # We don't want a delta here, just do a normal insertion.
                return super(KnitVersionedFile, self)._add_delta(version_id,
                    parents, delta_parent, sha1, noeol, delta)

        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta)

        access_memo = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, access_memo, parents)

    def _add_raw_records(self, records, data):
        """Add all the records 'records' with data pre-joined in 'data'."""

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        self._index.check_versions_present(version_ids)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts,
        nostore_sha, random_id, check_content):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines, random_id, check_content)
        return self._add(version_id, lines, parents, self.delta,
            parent_texts, None, nostore_sha, random_id)

    def _add_lines(self, version_id, parents, lines, parent_texts,
        left_matching_blocks, nostore_sha, random_id, check_content):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines, random_id, check_content)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta,
            parent_texts, left_matching_blocks, nostore_sha, random_id)

    def _check_add(self, version_id, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        assert self.writable, "knit is not opened for write"
        ### FIXME escape. RBC 20060228
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id, self.filename)
        self.check_not_reserved_id(version_id)
        # Technically this could be avoided if we are happy to allow duplicate
        # id insertion when other things than bzr core insert texts, but it
        # seems useful for folk using the knit api directly to have some safety
        # blanket that we can disable.
        if not random_id and self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)
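
    # Editorial note, not part of the original module: _check_add above
    # rejects version ids containing whitespace (InvalidRevisionId) and,
    # unless random_id is passed, ids that are already present
    # (RevisionAlreadyPresent).  For a writable knit `vf` (hypothetical name):
    #
    #     vf._check_add('good-id', ['line\n'], False, True)   # accepted
    #     vf._check_add('bad id', ['line\n'], False, True)    # InvalidRevisionId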

    def _add(self, version_id, lines, parents, delta, parent_texts,
        left_matching_blocks, nostore_sha, random_id):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first parent.

        Any versions not present will be converted into ghosts.
        """
        # 461    0   6546.0390     43.9100   bzrlib.knit:489(_add)
        # +400   0    889.4890    418.9790   +bzrlib.knit:192(lower_fulltext)
        # +461   0   1364.8070    108.8030   +bzrlib.knit:996(add_record)
        # +461   0    193.3940     41.5720   +bzrlib.knit:898(add_version)
        # +461   0    134.0590     18.3810   +bzrlib.osutils:361(sha_strings)
        # +461   0     36.3420     15.4540   +bzrlib.knit:146(make)
        # +1383  0      8.0370      8.0370   +<len>
        # +61    0     13.5770      7.9190   +bzrlib.knit:199(lower_line_delta)
        # +61    0    963.3470      7.8740   +bzrlib.knit:427(_get_content)
        # +61    0    973.9950      5.2950   +bzrlib.knit:136(line_delta)
        # +61    0   1918.1800      5.2640   +bzrlib.knit:359(_merge_annotations)

        # first thing, if the content is something we don't need to store, find
        # that out.
        line_bytes = ''.join(lines)
        digest = sha_string(line_bytes)
        if nostore_sha == digest:
            raise errors.ExistingContent

        present_parents = []
        if parent_texts is None:
            parent_texts = {}
        for parent in parents:
            if self.has_version(parent):
                present_parents.append(parent)

        # can only compress against the left most present parent.
        if (delta and
            (len(present_parents) == 0 or
             present_parents[0] != parents[0])):
            delta = False

        text_length = len(line_bytes)
        options = []
        if lines[-1][-1] != '\n':
            # copy the contents of lines.
            lines = lines[:]
            options.append('no-eol')
            lines[-1] = lines[-1] + '\n'
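
        # Editorial note, not part of the original module: the block above
        # normalises a text whose last line lacks a trailing newline, e.g.
        # ['a\n', 'b'] is stored as ['a\n', 'b\n'] with options == ['no-eol'],
        # and the newline is stripped again on extraction via
        # strip_last_line_newline().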

        if len(present_parents) and delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas. This should minimize both
            # I/O and the time spent applying deltas.
            delta = self._check_should_delta(present_parents)

        assert isinstance(version_id, str)
        content = self.factory.make(lines, version_id)
        if delta or (self.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if needed.
            delta_hunks = self._merge_annotations(content, present_parents,
                parent_texts, delta, self.factory.annotated,
                left_matching_blocks)

        if delta:
            options.append('line-delta')
            store_lines = self.factory.lower_line_delta(delta_hunks)
            size, bytes = self._data._record_to_data(version_id, digest,
                store_lines)
        else:
            options.append('fulltext')
            # get mixed annotation + content and feed it into the
            # serialiser.
            store_lines = self.factory.lower_fulltext(content)
            size, bytes = self._data._record_to_data(version_id, digest,
                store_lines)

        access_memo = self._data.add_raw_records([size], bytes)[0]
        self._index.add_versions(
            ((version_id, options, access_memo, parents),),
            random_id=random_id)
        return digest, text_length, content

    def check(self, progress_bar=None):
        """See VersionedFile.check()."""

                elif method == 'line-delta':
                    delta = self.factory.parse_line_delta(data, version_id)
                    content = content.copy()
                    content._lines = self._apply_delta(content._lines,
                        delta)
                content_map[component_id] = content

            if 'no-eol' in self._index.get_options(version_id):
                content = content.copy()
                content.strip_last_line_newline()
            final_content[version_id] = content

            # digest here is the digest from the last applied component.
            text = content.text()
            if sha_strings(text) != digest:
                raise KnitCorrupt(self.filename,
                    'sha-1 does not match %s' % version_id)

            text_map[version_id] = text
        return text_map, final_content

    @staticmethod
    def _apply_delta(lines, delta):
        """Apply delta to lines."""
        offset = 0
        for start, end, count, delta_lines in delta:
            lines[offset+start:offset+end] = delta_lines
            offset = offset + (start - end) + count
        return lines
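
    # Editorial note, not part of the original module: a worked application of
    # _apply_delta above.  The hunk (1, 2, 2, [...]) replaces the basis line
    # at index 1 with two new lines:
    #
    #     _apply_delta(['one\n', 'two\n', 'three\n'],
    #                  [(1, 2, 2, ['TWO\n', 'TWO-AND-A-HALF\n'])])
    #     => ['one\n', 'TWO\n', 'TWO-AND-A-HALF\n', 'three\n']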

    def iter_lines_added_or_present_in_versions(self, version_ids=None,

        self.source_ancestry = set(self.source.get_ancestry(version_ids))
        this_versions = set(self.target._index.get_versions())
        # XXX: For efficiency we should not look at the whole index,
        #      we only need to consider the referenced revisions - they
        #      must all be present, or the method must be full-text.
        #      TODO, RBC 20070919
        needed_versions = self.source_ancestry - this_versions
        cross_check_versions = self.source_ancestry.intersection(this_versions)
        mismatched_versions = set()
        for version in cross_check_versions:
            # scan to include needed parents.
            n1 = set(self.target.get_parents_with_ghosts(version))
            n2 = set(self.source.get_parents_with_ghosts(version))
            if n1 != n2:
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the n2 revisions.
                for parent in n2:
                    if parent in n1:
                        # safe
                        continue
                    parent_ancestors = self.source.get_ancestry(parent)
                    if version in parent_ancestors:
                        raise errors.GraphCycleError([parent, version])
                # ensure this parent will be available later.
                new_parents = n2.difference(n1)
                needed_versions.update(new_parents.difference(this_versions))
                mismatched_versions.add(version)

        if not needed_versions and not mismatched_versions:
            return 0
        full_list = topo_sort(self.source.get_graph())

            assert version_id == version_id2, 'logic error, inconsistent results'
            count = count + 1
            pb.update("Joining knit", count, total)
            if converter:
                size, raw_data = converter(raw_data, version_id, options,
                    parents)
            else:
                size = len(raw_data)
            raw_records.append((version_id, options, parents, size))
            raw_datum.append(raw_data)
        self.target._add_raw_records(raw_records, ''.join(raw_datum))

        for version in mismatched_versions:
            # FIXME RBC 20060309 is this needed?
            n1 = set(self.target.get_parents_with_ghosts(version))
            n2 = set(self.source.get_parents_with_ghosts(version))
            # write a combined record to our history preserving the current
            # parents as first in the list
            new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
            self.target.fix_parents(version, new_parents)

    def _anno_to_plain_converter(self, raw_data, version_id, options,
                                 parents):
        """Convert annotated content to plain content."""
        data, digest = self.source._data._parse_record(version_id, raw_data)
        if 'fulltext' in options:
            content = self.source.factory.parse_fulltext(data, version_id)
            lines = self.target.factory.lower_fulltext(content)
        else:
            delta = self.source.factory.parse_line_delta(data, version_id,
                plain=True)
            lines = self.target.factory.lower_line_delta(delta)
        return self.target._data._record_to_data(version_id, digest, lines)
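

# Editorial sketch, not part of the original module: the converter above
# re-serialises annotated records as plain ones.  For a fulltext, that comes
# down to dropping the "origin " prefix from every stored line, as this
# simplified illustration shows.
def _strip_annotations_example(annotated_lines):
    # _strip_annotations_example(['rev-1 hello\n', 'rev-2 world\n'])
    # => ['hello\n', 'world\n']
    return [line.split(' ', 1)[1] for line in annotated_lines]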


InterVersionedFile.register_optimiser(InterKnit)

        self.source_ancestry = set(self.source.get_ancestry(version_ids))
        this_versions = set(self.target._index.get_versions())
        needed_versions = self.source_ancestry - this_versions
        cross_check_versions = self.source_ancestry.intersection(this_versions)
        mismatched_versions = set()
        for version in cross_check_versions:
            # scan to include needed parents.
            n1 = set(self.target.get_parents_with_ghosts(version))
            n2 = set(self.source.get_parents(version))
            # if all of n2's parents are in n1, then it's fine.
            if n2.difference(n1):
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the n2 revisions.
                for parent in n2:
                    if parent in n1:
                        # safe
                        continue
                    parent_ancestors = self.source.get_ancestry(parent)
                    if version in parent_ancestors:
                        raise errors.GraphCycleError([parent, version])
                # ensure this parent will be available later.
                new_parents = n2.difference(n1)
                needed_versions.update(new_parents.difference(this_versions))
                mismatched_versions.add(version)

        if not needed_versions and not mismatched_versions:
            return 0
        full_list = topo_sort(self.source.get_graph())


InterVersionedFile.register_optimiser(WeaveToKnit)


class KnitSequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        """

        # CAUTION: stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
            # following improvement
            #     704   0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674   0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519   0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to
            #     704   0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400   0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519   0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize


# Deprecated, use PatienceSequenceMatcher instead
KnitSequenceMatcher = patiencediff.PatienceSequenceMatcher
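

# Editorial sketch, not part of the original module: whichever implementation
# the KnitSequenceMatcher name is bound to, it exposes the familiar
# difflib.SequenceMatcher interface, so a caller can do:
def _sequence_matcher_example():
    s = KnitSequenceMatcher(None, ['a\n', 'b\n', 'c\n'], ['a\n', 'c\n'])
    # get_matching_blocks() returns (i, j, n) triples describing runs of
    # matching lines, ending with the (len(a), len(b), 0) sentinel.
    return s.get_matching_blocks()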


def annotate_knit(knit, revision_id):