class KnitContent(object):
    """Content of a knit version to which deltas can be applied."""

    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def apply_delta(self, delta, new_version_id):
        """Apply delta to this object to become new_version_id."""
        raise NotImplementedError(self.apply_delta)

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = new_lines.text()
        old_texts = self.text()
        s = patiencediff.PatienceSequenceMatcher(None, old_texts, new_texts)
        for tag, i1, i2, j1, j2 in s.get_opcodes():
            if tag == 'equal':
                continue
            # ofrom, oto, length, data
            yield i1, i2, j2 - j1, new_lines._lines[j1:j2]

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))

    @staticmethod
    def get_line_delta_blocks(knit_delta, source, target):
        """Extract SequenceMatcher.get_matching_blocks() from a knit delta"""
        target_len = len(target)
        s_pos = 0
        t_pos = 0
        for s_begin, s_end, t_len, new_text in knit_delta:
            true_n = s_begin - s_pos
            n = true_n
            if n > 0:
                # knit deltas do not provide reliable info about whether the
                # last line of a file matches, due to eol handling.
                if source[s_pos + n - 1] != target[t_pos + n - 1]:
                    n -= 1
                if n > 0:
                    yield s_pos, t_pos, n
            t_pos += t_len + true_n
            s_pos = s_end
        n = target_len - t_pos
        if n > 0:
            if source[s_pos + n - 1] != target[t_pos + n - 1]:
                n -= 1
            if n > 0:
                yield s_pos, t_pos, n
        yield s_pos + (target_len - t_pos), target_len, 0
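
# Illustrative example (not part of the original bzrlib source): converting a
# knit line-delta back into SequenceMatcher-style matching blocks with the
# static helper above. The revision data here is made up for the sketch.
def _example_get_line_delta_blocks():
    source = ['a\n', 'b\n', 'c\n']
    target = ['a\n', 'x\n', 'c\n']
    # One hunk: replace old line 1 with the single new line 'x\n'.
    delta = [(1, 2, 1, ['x\n'])]
    blocks = list(KnitContent.get_line_delta_blocks(delta, source, target))
    assert blocks == [(0, 0, 1), (2, 2, 1), (3, 3, 0)]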

class AnnotatedKnitContent(KnitContent):
    """Annotated content."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        return iter(self._lines)

    def apply_delta(self, delta, new_version_id):
        """Apply delta to this object to become new_version_id."""
        offset = 0
        lines = self._lines
        for start, end, count, delta_lines in delta:
            lines[offset+start:offset+end] = delta_lines
            offset = offset + (start - end) + count

    def strip_last_line_newline(self):
        line = self._lines[-1][1].rstrip('\n')
        self._lines[-1] = (self._lines[-1][0], line)

    def text(self):
        try:
            return [text for origin, text in self._lines]
        except ValueError, e:
            # most commonly (only?) caused by the internal form of the knit
            # missing annotation information because of a bug - see thread
            # around 20071015
            raise KnitCorrupt(self,
                "line in annotated knit missing annotation information: %s"
                % (e,))

    def copy(self):
        return AnnotatedKnitContent(self._lines[:])
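
# Illustrative example (not part of the original bzrlib source): a knit delta
# is a list of (start, end, count, lines) hunks against the old text, applied
# in order with a running offset.
def _example_apply_delta():
    content = AnnotatedKnitContent([('rev-1', 'a\n'), ('rev-1', 'b\n')])
    # Replace old line 1 ('b\n') with two new lines attributed to rev-2.
    delta = [(1, 2, 2, [('rev-2', 'B\n'), ('rev-2', 'c\n')])]
    content.apply_delta(delta, 'rev-2')
    assert content.text() == ['a\n', 'B\n', 'c\n']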

class PlainKnitContent(KnitContent):
    """Unannotated content.

    When annotate[_iter] is called on this content, the same version is reported
    for all lines. Generally, annotate[_iter] is not useful on PlainKnitContent
    objects.
    """

    def __init__(self, lines, version_id):
        self._lines = lines
        self._version_id = version_id

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        for line in self._lines:
            yield self._version_id, line

    def apply_delta(self, delta, new_version_id):
        """Apply delta to this object to become new_version_id."""
        offset = 0
        lines = self._lines
        for start, end, count, delta_lines in delta:
            lines[offset+start:offset+end] = delta_lines
            offset = offset + (start - end) + count
        self._version_id = new_version_id

    def copy(self):
        return PlainKnitContent(self._lines[:], self._version_id)

    def strip_last_line_newline(self):
        self._lines[-1] = self._lines[-1].rstrip('\n')

    def text(self):
        return self._lines

class KnitAnnotateFactory(object):
    """Factory for creating annotated Content objects."""

    annotated = True

    def make(self, lines, version_id):
        num_lines = len(lines)
        return AnnotatedKnitContent(zip([version_id] * num_lines, lines))

    def parse_fulltext(self, content, version_id):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\n
        internal representation is of the format:
        (revid, plaintext)
        """
        lines = [tuple(line.split(' ', 1)) for line in content]
        return AnnotatedKnitContent(lines)

    def parse_line_delta(self, lines, version_id, plain=False):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        1..count lines 1..count
        revid(utf8) newline\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])

        :param plain: If True, the lines are returned as a plain
            list without annotations, not as a list of (origin, content) tuples, i.e.
            (start, end, count, [1..count newline])
        """
        result = []
        lines = iter(lines)
        next = lines.next

        cache = {}
        def cache_and_return(line):
            origin, text = line.split(' ', 1)
            return cache.setdefault(origin, origin), text

        # walk through the lines parsing.
        # Note that the plain test is explicitly pulled out of the
        # loop to minimise any performance impact
        if plain:
            for header in lines:
                start, end, count = [int(n) for n in header.split(',')]
                contents = [next().split(' ', 1)[1] for i in xrange(count)]
                result.append((start, end, count, contents))
        else:
            for header in lines:
                start, end, count = [int(n) for n in header.split(',')]
                contents = [tuple(next().split(' ', 1)) for i in xrange(count)]
                result.append((start, end, count, contents))
        return result

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return (line.split(' ', 1)[1] for line in lines)

    def get_linedelta_content(self, lines):
        """Extract just the content from a line delta.

        This doesn't return all of the extra information stored in a delta.
        Only the actual content lines.
        """
        lines = iter(lines)
        next = lines.next
        for header in lines:
            header = header.split(',')
            count = int(header[2])
            for i in xrange(count):
                origin, text = next().split(' ', 1)
                yield text

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        # TODO: jam 20070209 We only do the caching thing to make sure that
        #       the origin is a valid utf-8 line, eventually we could remove it
        return ['%s %s' % (o, t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        # TODO: jam 20070209 We only do the caching thing to make sure that
        #       the origin is a valid utf-8 line, eventually we could remove it
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend(origin + ' ' + text
                       for origin, text in lines)
        return out

    def annotate_iter(self, knit, version_id):
        content = knit._get_content(version_id)
        return content.annotate_iter()
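
# Illustrative example (not part of the original bzrlib source): the
# serialised annotated forms produced by the factory above. 'rev-1' and
# 'rev-2' are made-up revision ids.
def _example_annotated_serialisation():
    factory = KnitAnnotateFactory()
    content = factory.make(['hello\n'], 'rev-1')
    # A fulltext line is "origin text".
    assert factory.lower_fulltext(content) == ['rev-1 hello\n']
    # A delta hunk is a "start,end,count" header followed by count lines.
    delta = [(0, 1, 1, [('rev-2', 'goodbye\n')])]
    assert factory.lower_line_delta(delta) == ['0,1,1\n', 'rev-2 goodbye\n']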

class KnitPlainFactory(object):
    """Factory for creating plain Content objects."""

    annotated = False

    def make(self, lines, version_id):
        return PlainKnitContent(lines, version_id)

    def parse_fulltext(self, content, version_id):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - its just a constant versionid.
        """
        return self.make(content, version_id)

    def parse_line_delta_iter(self, lines, version_id):
        cur = 0
        num_lines = len(lines)
        while cur < num_lines:
            header = lines[cur]
            cur += 1
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, lines[cur:cur+c]
            cur += c

    def parse_line_delta(self, lines, version_id):
        return list(self.parse_line_delta_iter(lines, version_id))

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return iter(lines)

    def get_linedelta_content(self, lines):
        """Extract just the content from a line delta.

        This doesn't return all of the extra information stored in a delta.
        Only the actual content lines.
        """
        lines = iter(lines)
        next = lines.next
        for header in lines:
            header = header.split(',')
            count = int(header[2])
            for i in xrange(count):
                yield next()

    def lower_fulltext(self, content):
        return content.text()
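
    # Illustrative example (not part of the original bzrlib source):
    # round-tripping a serialised plain line-delta through parse_line_delta.
    def _example_parse_line_delta(self):
        serialised = ['0,1,1\n', 'goodbye\n']
        hunks = self.parse_line_delta(serialised, 'rev-2')
        assert hunks == [(0, 1, 1, ['goodbye\n'])]
        return hunks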
        self.writable = (access_mode == 'w')
        self.delta = delta

        self._max_delta_chain = 200

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
            access_mode, create=create, file_mode=file_mode,
            create_parent_dir=create_parent_dir, delay_create=delay_create,
            dir_mode=dir_mode)
        if access_method is None:
            _access = _KnitAccess(transport, relpath + DATA_SUFFIX, file_mode, dir_mode,
                ((create and not len(self)) and delay_create), create_parent_dir)
        else:
            _access = access_method
        if create and not len(self) and not delay_create:
            _access.create()
        self._data = _KnitData(_access)

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                           self.transport.abspath(self.filename))

    def _check_should_delta(self, first_parents):
        """Iterate back through the parent listing, looking for a fulltext.

        This is used when we want to decide whether to add a delta or a new
        fulltext. It searches for _max_delta_chain parents. When it finds a
        fulltext parent, it sees if the total size of the deltas leading up to
        it is large enough to indicate that we want a new full text anyway.

        Return True if we should create a new delta, False if we should use a
        full text.
        """
        delta_size = 0
        fulltext_size = None
        delta_parents = first_parents
        for count in xrange(self._max_delta_chain):
            parent = delta_parents[0]
            method = self._index.get_method(parent)
            index, pos, size = self._index.get_position(parent)
            if method == 'fulltext':
                fulltext_size = size
                break
            delta_size += size
            delta_parents = self._index.get_parents(parent)
        else:
            # We couldn't find a fulltext, so we must create a new one
            return False

        return fulltext_size > delta_size
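
    # Illustrative example (not part of the original bzrlib source): the
    # decision rule above on plain numbers. With a 1000 byte fulltext behind
    # 310 bytes of accumulated deltas, another delta is still worthwhile.
    def _example_should_delta_rule(self):
        fulltext_size = 1000
        delta_size = 100 + 120 + 90
        return fulltext_size > delta_size  # True: store a new delta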

    def _add_raw_records(self, records, data):
        """Add all the records 'records' with data pre-joined in 'data'.

        :param records: A list of tuples(version_id, options, parents, size).
        :param data: The data for the records. When it is written, the records
                     are adjusted to have pos pointing into data by the sum of
                     the preceding records sizes.
        """

    def copy_to(self, name, transport):
        """See VersionedFile.copy_to()."""
        # copy the current index to a temp index to avoid racing with local
        # writes
        transport.put_file_non_atomic(name + INDEX_SUFFIX + '.tmp',
                self.transport.get(self._index._filename))
        # copy the data file
        f = self._data._open_file()
        try:
            transport.put_file(name + DATA_SUFFIX, f)
        finally:
            f.close()
        # move the copied index into place
        transport.move(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)

    def create_empty(self, name, transport, mode=None):
        return KnitVersionedFile(name, transport, factory=self.factory,
                                 delta=self.delta, create=True)

    def get_data_stream(self, required_versions):
        """Get a data stream for the specified versions.

        Versions may be returned in any order, not necessarily the order
        specified.

        :param required_versions: The exact set of versions to be extracted.
            Unlike some other knit methods, this is not used to generate a
            transitive closure, rather it is used precisely as given.

        :returns: format_signature, list of (version, options, length, parents),
            reader_callable.
        """
        if not isinstance(required_versions, set):
            required_versions = set(required_versions)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        for version_id in required_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # Pick the desired versions out of the index in oldest-to-newest order
        version_list = []
        for version_id in self.versions():
            if version_id in required_versions:
                version_list.append(version_id)

        # create the list of version information for the result
        copy_queue_records = []
        copy_set = set()
        result_version_list = []
        for version_id in version_list:
            options = self._index.get_options(version_id)
            parents = self._index.get_parents_with_ghosts(version_id)
            index_memo = self._index.get_position(version_id)
            copy_queue_records.append((version_id, index_memo))
            none, data_pos, data_size = index_memo
            copy_set.add(version_id)
            # version, options, length, parents
            result_version_list.append((version_id, options, data_size,
                parents))

        # Read the compressed record data.
        # From here down to the return should really be logic in the returned
        # callable -- in a class that adapts read_records_iter_raw to read
        # requests.
        raw_datum = []
        for (version_id, raw_data), \
            (version_id2, options, _, parents) in \
            izip(self._data.read_records_iter_raw(copy_queue_records),
                 result_version_list):
            assert version_id == version_id2, 'logic error, inconsistent results'
            raw_datum.append(raw_data)
        pseudo_file = StringIO(''.join(raw_datum))
        def read(length):
            if length is None:
                return pseudo_file.read()
            else:
                return pseudo_file.read(length)
        return (self.get_format_signature(), result_version_list, read)

    def _extract_blocks(self, version_id, source, target):
        if self._index.get_method(version_id) != 'line-delta':
            return None
        parent, sha1, noeol, delta = self.get_delta(version_id)
        return KnitContent.get_line_delta_blocks(delta, source, target)

    def get_delta(self, version_id):
        """Get a delta for constructing version from some other version."""
        self.check_not_reserved_id(version_id)
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        parents = self.get_parents(version_id)
        if len(parents):
            parent = parents[0]
        else:
            parent = None
        index_memo = self._index.get_position(version_id)
        data, sha1 = self._data.read_records(((version_id, index_memo),))[version_id]
        noeol = 'no-eol' in self._index.get_options(version_id)
        if 'fulltext' == self._index.get_method(version_id):
            new_content = self.factory.parse_fulltext(data, version_id)
            if parent is not None:
                reference_content = self._get_content(parent)
                old_texts = reference_content.text()
            else:
                old_texts = []
            new_texts = new_content.text()
            delta_seq = patiencediff.PatienceSequenceMatcher(None, old_texts,
                                                             new_texts)
            return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
        else:
            delta = self.factory.parse_line_delta(data, version_id)
            return parent, sha1, noeol, delta

    def get_format_signature(self):
        """See VersionedFile.get_format_signature()."""
        if self.factory.annotated:
            annotated_part = "annotated"
        else:
            annotated_part = "plain"
        return "knit-%s" % (annotated_part,)

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    def insert_data_stream(self, (format, data_list, reader_callable)):
        """Insert knit records from a data stream into this knit.

        If a version in the stream is already present in this knit, it will not
        be inserted a second time. It will be checked for consistency with the
        stored version however, and may cause a KnitCorrupt error to be raised
        if the data in the stream disagrees with the already stored data.

        :seealso: get_data_stream
        """
        if format != self.get_format_signature():
            trace.mutter('incompatible format signature inserting to %r', self)
            raise KnitDataStreamIncompatible(
                format, self.get_format_signature())

        for version_id, options, length, parents in data_list:
            if self.has_version(version_id):
                # First check: the list of parents.
                my_parents = self.get_parents_with_ghosts(version_id)
                if my_parents != parents:
                    # XXX: KnitCorrupt is not quite the right exception here.
                    raise KnitCorrupt(
                        self.filename,
                        'parents list %r from data stream does not match '
                        'already recorded parents %r for %s'
                        % (parents, my_parents, version_id))

                # Also check the SHA-1 of the fulltext this content will
                # produce.
                raw_data = reader_callable(length)
                my_fulltext_sha1 = self.get_sha1(version_id)
                df, rec = self._data._parse_record_header(version_id, raw_data)
                stream_fulltext_sha1 = rec[3]
                if my_fulltext_sha1 != stream_fulltext_sha1:
                    # Actually, we don't know if it's this knit that's corrupt,
                    # or the data stream we're trying to insert.
                    raise KnitCorrupt(
                        self.filename, 'sha-1 does not match %s' % version_id)
            else:
                if 'line-delta' in options:
                    # Make sure that this knit record is actually useful: a
                    # line-delta is no use unless we have its parent.
                    # Fetching from a broken repository with this problem
                    # shouldn't break the target repository.
                    if not self._index.has_version(parents[0]):
                        raise KnitCorrupt(
                            self.filename,
                            'line-delta from stream references '
                            'missing parent %s' % parents[0])
                self._add_raw_records(
                    [(version_id, options, parents, length)],
                    reader_callable(length))
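
    # Illustrative example (not part of the original bzrlib source):
    # get_data_stream and insert_data_stream are designed to be paired when
    # copying versions between two knits with the same format signature.
    def _example_copy_stream_from(self, source_knit, version_ids):
        self.insert_data_stream(source_knit.get_data_stream(version_ids))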

    def versions(self):
        """See VersionedFile.versions."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "versions scales with size of history")
        return self._index.get_versions()

    def has_version(self, version_id):
        """See VersionedFile.has_version."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "has_version is a LBYL scenario")
        return self._index.has_version(version_id)

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None,
                           left_matching_blocks=None):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text.
        """
        if left_matching_blocks is not None:
            delta_seq = diff._PrematchedMatcher(left_matching_blocks)
        else:
            delta_seq = None
        if annotated:
            for parent_id in parents:
                merge_content = self._get_content(parent_id, parent_texts)
                if (parent_id == parents[0] and delta_seq is not None):
                    seq = delta_seq
                else:
                    seq = patiencediff.PatienceSequenceMatcher(
                        None, merge_content.text(), content.text())
                for i, j, n in seq.get_matching_blocks():
                    if n == 0:
                        continue
                    # this appears to copy (origin, text) pairs across to the
                    # new content for any line that matches the last-checked
                    # parent.
                    # FIXME: save the sequence control data for delta compression
                    # against the most relevant parent rather than rediffing.
                    content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            if delta_seq is None:
                reference_content = self._get_content(parents[0], parent_texts)
                new_texts = content.text()
                old_texts = reference_content.text()
                delta_seq = patiencediff.PatienceSequenceMatcher(
                    None, old_texts, new_texts)
            return self._make_line_delta(delta_seq, content)

    def _make_line_delta(self, delta_seq, new_content):
        """Generate a line delta from delta_seq and new_content."""
        diff_hunks = []
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3],
                               new_content._lines[op[3]:op[4]]))
        return diff_hunks

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        self._index.check_versions_present(version_ids)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts,
        nostore_sha, random_id, check_content):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines, random_id, check_content)
        return self._add(version_id, lines, parents, self.delta,
            parent_texts, None, nostore_sha, random_id)

    def _add_lines(self, version_id, parents, lines, parent_texts,
        left_matching_blocks, nostore_sha, random_id, check_content):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines, random_id, check_content)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta,
            parent_texts, left_matching_blocks, nostore_sha, random_id)

    def _check_add(self, version_id, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id, self.filename)
        self.check_not_reserved_id(version_id)
        # Technically this could be avoided if we are happy to allow duplicate
        # id insertion when other things than bzr core insert texts, but it
        # seems useful for folk using the knit api directly to have some safety
        # blanket that we can disable.
        if not random_id and self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def _add(self, version_id, lines, parents, delta, parent_texts,
             left_matching_blocks, nostore_sha, random_id):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first parent.

        Any versions not present will be converted into ghosts.
        """
        # first thing, if the content is something we don't need to store, find
        # out.
        line_bytes = ''.join(lines)
        digest = sha_string(line_bytes)
        if nostore_sha == digest:
            raise errors.ExistingContent

        present_parents = []
        if parent_texts is None:
            parent_texts = {}
        for parent in parents:
            if self.has_version(parent):
                present_parents.append(parent)

        # can only compress against the left most present parent.
        if (delta and
            (len(present_parents) == 0 or
             present_parents[0] != parents[0])):
            delta = False

        text_length = len(line_bytes)
        options = []
        if lines:
            if lines[-1][-1] != '\n':
                # copy the contents of lines.
                lines = lines[:]
                options.append('no-eol')
                lines[-1] = lines[-1] + '\n'
                line_bytes += '\n'

        if delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas. This should minimize both
            # I/O and the time spend applying deltas.
            delta = self._check_should_delta(present_parents)

        assert isinstance(version_id, str)
        content = self.factory.make(lines, version_id)
        if delta or (self.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if needed.
            delta_hunks = self._merge_annotations(content, present_parents,
                parent_texts, delta, self.factory.annotated,
                left_matching_blocks)

        if delta:
            options.append('line-delta')
            store_lines = self.factory.lower_line_delta(delta_hunks)
            size, bytes = self._data._record_to_data(version_id, digest,
                store_lines)
        else:
            options.append('fulltext')
            # isinstance is slower and we have no hierarchy.
            if self.factory.__class__ == KnitPlainFactory:
                # Use the already joined bytes saving iteration time in
                # _record_to_data.
                size, bytes = self._data._record_to_data(version_id, digest,
                    lines, [line_bytes])
            else:
                # get mixed annotation + content and feed it into the
                # serialiser.
                store_lines = self.factory.lower_fulltext(content)
                size, bytes = self._data._record_to_data(version_id, digest,
                    store_lines)

        access_memo = self._data.add_raw_records([size], bytes)[0]
        self._index.add_versions(
            ((version_id, options, access_memo, parents),),
            random_id=random_id)
        return digest, text_length, content

    def check(self, progress_bar=None):
        """See VersionedFile.check()."""
        self._check_versions_present(self.versions())
                if component_id in content_map:
                    content = content_map[component_id]
                else:
                    if method == 'fulltext':
                        assert content is None
                        content = self.factory.parse_fulltext(data, version_id)
                    elif method == 'line-delta':
                        delta = self.factory.parse_line_delta(data, version_id)
                        if multiple_versions:
                            # only doing this when we want multiple versions
                            # output avoids list copies - which reference and
                            # dereference many strings.
                            content = content.copy()
                        content.apply_delta(delta, version_id)
                    if multiple_versions:
                        content_map[component_id] = content

            if 'no-eol' in self._index.get_options(version_id):
                if multiple_versions:
                    content = content.copy()
                content.strip_last_line_newline()
            final_content[version_id] = content

            # digest here is the digest from the last applied component.
            text = content.text()
            actual_sha = sha_strings(text)
            if actual_sha != digest:
                raise KnitCorrupt(self.filename,
                    '\n  sha-1 %s'
                    '\n  of reconstructed text does not match'
                    '\n  expected %s'
                    '\n  for version %s' %
                    (actual_sha, digest, version_id))
            text_map[version_id] = text
        return text_map, final_content

    def iter_lines_added_or_present_in_versions(self, version_ids=None,
                                                pb=None):
        """See VersionedFile.iter_lines_added_or_present_in_versions()."""
        if version_ids is None:
            version_ids = self.versions()
        if pb is None:
            pb = progress.DummyProgress()
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need version_id, position, length
        version_id_records = []
        requested_versions = set(version_ids)
        # filter for available versions
        for version_id in requested_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # get a in-component-order queue:
        for version_id in self.versions():
            if version_id in requested_versions:
                index_memo = self._index.get_position(version_id)
                version_id_records.append((version_id, index_memo))

        total = len(version_id_records)
        for version_idx, (version_id, data, sha_value) in \
            enumerate(self._data.read_records_iter(version_id_records)):
            pb.update('Walking content.', version_idx, total)
            method = self._index.get_method(version_id)

            assert method in ('fulltext', 'line-delta')
            if method == 'fulltext':
                line_iterator = self.factory.get_fulltext_content(data)
            else:
                line_iterator = self.factory.get_linedelta_content(data)
            for line in line_iterator:
                yield line

        pb.update('Walking content.', total, total)

    def iter_parents(self, version_ids):
        """Iterate through the parents for many version ids.

        :param version_ids: An iterable yielding version_ids.
        :return: An iterator that yields (version_id, parents). Requested
            version_ids not present in the versioned file are simply skipped.
            The order is undefined, allowing for different optimisations in
            the underlying implementation.
        """
        return self._index.iter_parents(version_ids)

    def num_versions(self):
        """See VersionedFile.num_versions()."""
        return self._index.num_versions()
            versions = [versions]
        if not versions:
            return []
        return self._index.get_ancestry_with_ghosts(versions)

    def plan_merge(self, ver_a, ver_b):
        """See VersionedFile.plan_merge."""
        ancestors_b = set(self.get_ancestry(ver_b, topo_sorted=False))
        ancestors_a = set(self.get_ancestry(ver_a, topo_sorted=False))
        annotated_a = self.annotate(ver_a)
        annotated_b = self.annotate(ver_b)
        return merge._plan_annotate_merge(annotated_a, annotated_b,
                                          ancestors_a, ancestors_b)


class _KnitComponentFile(object):
    """One of the files used to implement a knit database"""

    def __init__(self, transport, filename, mode, file_mode=None,
                 create_parent_dir=False, dir_mode=None):
        self._transport = transport
        self._filename = filename
        self._mode = mode
        self._file_mode = file_mode
        self._dir_mode = dir_mode
        self._create_parent_dir = create_parent_dir
        self._need_to_create = False

    def _full_path(self):
        """Return the full path to this file."""
        return self._transport.base + self._filename

    def check_header(self, fp):
        line = fp.readline()
        if line == '':
            # An empty file can actually be treated as though the file doesn't
            # exist yet.
            raise errors.NoSuchFile(self._full_path())
        if line != self.HEADER:
            raise KnitHeaderError(badline=line,
                              filename=self._transport.abspath(self._filename))

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self._filename)
            self._history.append(version_id)
        else:
            index = self._cache[version_id][5]
        self._cache[version_id] = (version_id,
                                   options,
                                   pos,
                                   size,
                                   parents,
                                   index)

    def __init__(self, transport, filename, mode, create=False, file_mode=None,
                 create_parent_dir=False, delay_create=False, dir_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode,
                                    file_mode=file_mode,
                                    create_parent_dir=create_parent_dir,
                                    dir_mode=dir_mode)
        self._cache = {}
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        try:
            fp = self._transport.get(self._filename)
            try:
                # _load_data may raise NoSuchFile if the target knit is
                # completely empty.
                _load_data(self, fp)
            finally:
                fp.close()
        except NoSuchFile:
            if mode != 'w' or not create:
                raise
            elif delay_create:
                self._need_to_create = True
            else:
                self._transport.put_bytes_non_atomic(
                    self._filename, self.HEADER, mode=self._file_mode)

    def get_graph(self):
        """Return a list of the node:parents lists from this knit index."""
        return [(vid, idx[4]) for vid, idx in self._cache.iteritems()]

    def get_ancestry(self, versions, topo_sorted=True):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        cache = self._cache
        pending = set(versions)
        while pending:
            version = pending.pop()
            # trim ghosts
            try:
                parents = [p for p in cache[version][4] if p in cache]
            except KeyError:
                raise RevisionNotPresent(version, self._filename)
            # if not completed and not a ghost
            pending.update([p for p in parents if p not in graph])
            graph[version] = parents
        if not topo_sorted:
            return graph.keys()
        return topo_sort(graph.items())

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        self.check_versions_present(versions)
        cache = self._cache
        graph = {}
        pending = set(versions)
        while pending:
            version = pending.pop()
            try:
                parents = cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
            else:
                # if not completed
                pending.update([p for p in parents if p not in graph])
                graph[version] = parents
        return topo_sort(graph.items())

    def iter_parents(self, version_ids):
        """Iterate through the parents for many version ids.

        :param version_ids: An iterable yielding version_ids.
        :return: An iterator that yields (version_id, parents). Requested
            version_ids not present in the versioned file are simply skipped.
            The order is undefined, allowing for different optimisations in
            the underlying implementation.
        """
        for version_id in version_ids:
            try:
                yield version_id, tuple(self.get_parents(version_id))
            except KeyError:
                pass

    def num_versions(self):
        return len(self._history)

    __len__ = num_versions

    def get_versions(self):
        """Get all the versions in the file, not topologically sorted."""
        return self._history

    def _version_list_to_index(self, versions):
        result_list = []
        cache = self._cache
        for version in versions:
            if version in cache:
                # -- inlined lookup() --
                result_list.append(str(cache[version][5]))
                # -- end lookup () --
            else:
                result_list.append('.' + version)
        return ' '.join(result_list)
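
    # Illustrative example (not part of the original bzrlib source): mirrors
    # the encoding above with a hand-built cache. 'rev-1' sits at history
    # position 0; 'ghost-rev' is absent, so it is written as a ghost.
    def _example_parent_encoding(self):
        cache = {'rev-1': ('rev-1', ['fulltext'], 0, 120, [], 0)}
        encoded = []
        for version in ['rev-1', 'ghost-rev']:
            if version in cache:
                encoded.append(str(cache[version][5]))
            else:
                encoded.append('.' + version)
        return ' '.join(encoded)  # -> '0 .ghost-rev'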

    def add_version(self, version_id, options, index_memo, parents):
        """Add a version record to the index."""
        self.add_versions(((version_id, options, index_memo, parents),))

    def add_versions(self, versions, random_id=False):
        """Add multiple versions to the index.

        :param versions: a list of tuples:
                         (version_id, options, pos, size, parents).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        lines = []
        orig_history = self._history[:]
        orig_cache = self._cache.copy()

        try:
            for version_id, options, (index, pos, size), parents in versions:
                line = "\n%s %s %s %s %s :" % (version_id,
                                               ','.join(options),
                                               pos,
                                               size,
                                               self._version_list_to_index(parents))
                assert isinstance(line, str), \
                    'content must be utf-8 encoded: %r' % (line,)
                lines.append(line)
                self._cache_version(version_id, options, pos, size, parents)
            if not self._need_to_create:
                self._transport.append_bytes(self._filename, ''.join(lines))
            else:
                sio = StringIO()
                sio.write(self.HEADER)
                sio.writelines(lines)
                sio.seek(0)
                self._transport.put_file_non_atomic(self._filename, sio,
                                    create_parent_dir=self._create_parent_dir,
                                    mode=self._file_mode,
                                    dir_mode=self._dir_mode)
                self._need_to_create = False
        except:
            # If any problems happen, restore the original values and re-raise
            self._history = orig_history
            self._cache = orig_cache
            raise

    def has_version(self, version_id):
        """True if the version is in the index."""
        return version_id in self._cache

    def get_position(self, version_id):
        """Return details needed to access the version.

        .kndx indices do not support split-out data, so return None for the
        index field.

        :return: a tuple (None, data position, size) to hand to the access
            logic to get the record.
        """
        entry = self._cache[version_id]
        return None, entry[2], entry[3]

    def get_method(self, version_id):
        """Return compression method of specified version."""
        try:
            options = self._cache[version_id][1]
        except KeyError:
            raise RevisionNotPresent(version_id, self._filename)
        if 'fulltext' in options:
            return 'fulltext'
        else:
            if 'line-delta' not in options:
                raise errors.KnitIndexUnknownMethod(self._full_path(), options)
            return 'line-delta'

    def get_options(self, version_id):
        """Return a string representation of the options.

        e.g. foo,bar
        """
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        return [parent for parent in self._cache[version_id][4]
                if parent in self._cache]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        cache = self._cache
        for version_id in version_ids:
            if version_id not in cache:
                raise RevisionNotPresent(version_id, self._filename)


class KnitGraphIndex(object):
    """A knit index that builds on GraphIndex."""

    def __init__(self, graph_index, deltas=False, parents=True, add_callback=None):
        """Construct a KnitGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param deltas: Allow delta-compressed records.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        :param parents: If True, record knits parents, if not do not record
            parents.
        """
        self._graph_index = graph_index
        self._deltas = deltas
        self._add_callback = add_callback
        self._parents = parents
        if deltas and not parents:
            raise KnitCorrupt(self, "Cannot do delta compression without "
                "parent tracking.")

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index keys, - 1-tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise RevisionNotPresent(missing_keys.pop(), self)

    def _present_keys(self, version_ids):
        return set([
            node[1] for node in self._get_entries(version_ids)])

    def _parentless_ancestry(self, versions):
        """Honour the get_ancestry API for parentless knit indices."""
        wanted_keys = self._version_ids_to_keys(versions)
        present_keys = self._present_keys(wanted_keys)
        missing = set(wanted_keys).difference(present_keys)
        if missing:
            raise RevisionNotPresent(missing.pop(), self)
        return list(self._keys_to_version_ids(present_keys))

    def get_ancestry(self, versions, topo_sorted=True):
        """See VersionedFile.get_ancestry."""
        if not self._parents:
            return self._parentless_ancestry(versions)
        # XXX: This will do len(history) index calls - perhaps
        # it should be altered to be a index core feature?
        # get a graph of all the mentioned versions:
        graph = {}
        ghosts = set()
        versions = self._version_ids_to_keys(versions)
        pending = set(versions)
        while pending:
            # get all pending nodes
            this_iteration = pending
            new_nodes = self._get_entries(this_iteration)
            found = set()
            pending = set()
            for (index, key, value, node_refs) in new_nodes:
                # dont ask for ghosties - otherwise
                # we can end up looping with pending
                # being entirely ghosted.
                graph[key] = [parent for parent in node_refs[0]
                    if parent not in ghosts]
                # queue parents
                for parent in graph[key]:
                    # dont examine known nodes again
                    if parent in graph:
                        continue
                    pending.add(parent)
                found.add(key)
            ghosts.update(this_iteration.difference(found))
        if versions.difference(graph):
            raise RevisionNotPresent(versions.difference(graph).pop(), self)
        if topo_sorted:
            result_keys = topo_sort(graph.items())
        else:
            result_keys = graph.iterkeys()
        return [key[0] for key in result_keys]

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        if not self._parents:
            return self._parentless_ancestry(versions)
        # XXX: This will do len(history) index calls - perhaps
        # it should be altered to be a index core feature?
        # get a graph of all the mentioned versions:
        graph = {}
        versions = self._version_ids_to_keys(versions)
        pending = set(versions)
        while pending:
            # get all pending nodes
            this_iteration = pending
            new_nodes = self._get_entries(this_iteration)
            pending = set()
            for (index, key, value, node_refs) in new_nodes:
                graph[key] = node_refs[0]
                # queue parents
                for parent in graph[key]:
                    # dont examine known nodes again
                    if parent in graph:
                        continue
                    pending.add(parent)
            missing_versions = this_iteration.difference(graph)
            missing_needed = versions.intersection(missing_versions)
            if missing_needed:
                raise RevisionNotPresent(missing_needed.pop(), self)
            for missing_version in missing_versions:
                # add a key, no parents
                graph[missing_version] = []
                pending.discard(missing_version) # don't look for it
        result_keys = topo_sort(graph.items())
        return [key[0] for key in result_keys]

    def get_graph(self):
        """Return a list of the node:parents lists from this knit index."""
        if not self._parents:
            return [(key, ()) for key in self.get_versions()]
        result = []
        for index, key, value, refs in self._graph_index.iter_all_entries():
            result.append((key[0], tuple([ref[0] for ref in refs[0]])))
        return result

    def iter_parents(self, version_ids):
        """Iterate through the parents for many version ids.

        :param version_ids: An iterable yielding version_ids.
        :return: An iterator that yields (version_id, parents). Requested
            version_ids not present in the versioned file are simply skipped.
            The order is undefined, allowing for different optimisations in
            the underlying implementation.
        """
        if self._parents:
            all_nodes = set(self._get_entries(self._version_ids_to_keys(version_ids)))
            all_parents = set()
            present_parents = set()
            for node in all_nodes:
                all_parents.update(node[3][0])
                # any node we are querying must be present
                present_parents.add(node[1])
            unknown_parents = all_parents.difference(present_parents)
            present_parents.update(self._present_keys(unknown_parents))
            for node in all_nodes:
                parents = []
                for parent in node[3][0]:
                    if parent in present_parents:
                        parents.append(parent[0])
                yield node[1][0], tuple(parents)
        else:
            for node in self._get_entries(self._version_ids_to_keys(version_ids)):
                yield node[1][0], ()

    def num_versions(self):
        return len(list(self._graph_index.iter_all_entries()))

    __len__ = num_versions

    def get_versions(self):
        """Get all the versions in the file, not topologically sorted."""
        return [node[1][0] for node in self._graph_index.iter_all_entries()]

    def has_version(self, version_id):
        """True if the version is in the index."""
        return len(self._present_keys(self._version_ids_to_keys([version_id]))) == 1

    def _keys_to_version_ids(self, keys):
        return tuple(key[0] for key in keys)

    def get_position(self, version_id):
        """Return details needed to access the version.

        :return: a tuple (index, data position, size) to hand to the access
            logic to get the record.
        """
        node = self._get_node(version_id)
        bits = node[2][1:].split(' ')
        return node[0], int(bits[0]), int(bits[1])

    def get_method(self, version_id):
        """Return compression method of specified version."""
        if not self._deltas:
            return 'fulltext'
        return self._parent_compression(self._get_node(version_id)[3][1])

    def _parent_compression(self, reference_list):
        # use the second reference list to decide if this is delta'd or not.
        if len(reference_list):
            return 'line-delta'
        else:
            return 'fulltext'

    def _get_node(self, version_id):
        try:
            return list(self._get_entries(self._version_ids_to_keys([version_id])))[0]
        except IndexError:
            raise RevisionNotPresent(version_id, self)

    def get_options(self, version_id):
        """Return a string representation of the options.

        e.g. foo,bar
        """
        node = self._get_node(version_id)
        if not self._deltas:
            options = ['fulltext']
        else:
            options = [self._parent_compression(node[3][1])]
        if node[2][0] == 'N':
            options.append('no-eol')
        return options

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        parents = list(self.iter_parents([version_id]))
        if not parents:
            # missing key
            raise errors.RevisionNotPresent(version_id, self)
        return parents[0][1]

    def get_parents_with_ghosts(self, version_id):
        """Return parents of specified version with ghosts."""
        nodes = list(self._get_entries(self._version_ids_to_keys([version_id]),
            check_present=True))
        if not self._parents:
            return ()
        return self._keys_to_version_ids(nodes[0][3][0])

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        keys = self._version_ids_to_keys(version_ids)
        present = self._present_keys(keys)
        missing = keys.difference(present)
        if missing:
            raise RevisionNotPresent(missing.pop(), self)

    def add_version(self, version_id, options, access_memo, parents):
        """Add a version record to the index."""
        return self.add_versions(((version_id, options, access_memo, parents),))

    def add_versions(self, versions, random_id=False):
        """Add multiple versions to the index.

        This function does not insert data into the Immutable GraphIndex
        backing the KnitGraphIndex, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param versions: a list of tuples:
                         (version_id, options, pos, size, parents).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        keys = {}
        for (version_id, options, access_memo, parents) in versions:
            index, pos, size = access_memo
            key = (version_id, )
            parents = tuple((parent, ) for parent in parents)
            if 'no-eol' in options:
                value = 'N'
            else:
                value = ' '
            value += "%d %d" % (pos, size)
            if not self._deltas:
                if 'line-delta' in options:
                    raise KnitCorrupt(self, "attempt to add line-delta in non-delta knit")
            if self._parents:
                if self._deltas:
                    if 'line-delta' in options:
                        node_refs = (parents, (parents[0],))
                    else:
                        node_refs = (parents, ())
                else:
                    node_refs = (parents, )
            else:
                if parents:
                    raise KnitCorrupt(self, "attempt to add node with parents "
                        "in parentless index.")
                node_refs = ()
            keys[key] = (value, node_refs)
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if (value, node_refs) != keys[key]:
                    raise KnitCorrupt(self, "inconsistent details in add_versions"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
        result = []
        if self._parents:
            for key, (value, node_refs) in keys.iteritems():
                result.append((key, value, node_refs))
        else:
            for key, (value, node_refs) in keys.iteritems():
                result.append((key, value))
        self._add_callback(result)
1822
def _version_ids_to_keys(self, version_ids):
1823
return set((version_id, ) for version_id in version_ids)
1826
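    # A minimal sketch of the node tuples add_versions hands to
    # self._add_callback, assuming a delta-capable index with parents
    # (identifiers are hypothetical). A no-eol fulltext of 120 bytes at
    # offset 0 with parent 'rev-1' becomes
    #
    #   (('rev-2',), 'N0 120', ((('rev-1',),), ()))
    #
    # while a line-delta repeats its compression parent in the second
    # reference list:
    #
    #   (('rev-3',), ' 120 45', ((('rev-2',),), (('rev-2',),)))
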
class _KnitAccess(object):
    """Access to knit records in a .knit file."""

    def __init__(self, transport, filename, _file_mode, _dir_mode,
        _need_to_create, _create_parent_dir):
        """Create a _KnitAccess for accessing and inserting data.

        :param transport: The transport the .knit is located on.
        :param filename: The filename of the .knit.
        """
        self._transport = transport
        self._filename = filename
        self._file_mode = _file_mode
        self._dir_mode = _dir_mode
        self._need_to_create = _need_to_create
        self._create_parent_dir = _create_parent_dir

    def add_raw_records(self, sizes, raw_data):
        """Add raw knit bytes to a storage area.

        The data is spooled to wherever the access method is storing data.

        :param sizes: An iterable containing the size of each raw data segment.
        :param raw_data: A bytestring containing the data.
        :return: A list of memos to retrieve the record later. Each memo is a
            tuple - (index, pos, length), where the index field is always None
            for the .knit access method.
        """
        assert type(raw_data) == str, \
            'data must be plain bytes was %s' % type(raw_data)
        if not self._need_to_create:
            base = self._transport.append_bytes(self._filename, raw_data)
        else:
            self._transport.put_bytes_non_atomic(self._filename, raw_data,
                create_parent_dir=self._create_parent_dir,
                mode=self._file_mode,
                dir_mode=self._dir_mode)
            self._need_to_create = False
            base = 0
        result = []
        for size in sizes:
            result.append((None, base, size))
            base += size
        return result

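    # Sketch of the returned memos (sizes are hypothetical): appending two
    # records of 10 and 20 bytes to a .knit that is currently 100 bytes
    # long returns
    #
    #   [(None, 100, 10), (None, 110, 20)]
    #
    # i.e. consecutive (index, pos, length) slices into the single file.
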
"""IFF this data access has its own storage area, initialise it.
1876
self._transport.put_bytes_non_atomic(self._filename, '',
1877
mode=self._file_mode)
1879
def open_file(self):
1880
"""IFF this data access can be represented as a single file, open it.
1882
For knits that are not mapped to a single file on disk this will
1885
:return: None or a file handle.
1888
return self._transport.get(self._filename)
1893
    def get_raw_records(self, memos_for_retrieval):
        """Get the raw bytes for the records.

        :param memos_for_retrieval: An iterable containing the (index, pos,
            length) memo for retrieving the bytes. The .knit method ignores
            the index as there is always only a single file.
        :return: An iterator over the bytes of the records.
        """
        read_vector = [(pos, size) for (index, pos, size) in memos_for_retrieval]
        for pos, data in self._transport.readv(self._filename, read_vector):
            yield data

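    # Sketch with hypothetical memos: the index element of each memo is
    # discarded and the remainder forms a readv request, so
    #
    #   list(access.get_raw_records([(None, 100, 10), (None, 110, 20)]))
    #
    # issues a single readv for [(100, 10), (110, 20)] and yields the
    # corresponding byte strings.
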
class _PackAccess(object):
    """Access to knit records via a collection of packs."""

    def __init__(self, index_to_packs, writer=None):
        """Create a _PackAccess object.

        :param index_to_packs: A dict mapping index objects to the transport
            and file names for obtaining data.
        :param writer: A tuple (pack.ContainerWriter, write_index) which
            contains the pack to write, and the index that reads from it will
            be associated with.
        """
        if writer:
            self.container_writer = writer[0]
            self.write_index = writer[1]
        else:
            self.container_writer = None
            self.write_index = None
        self.indices = index_to_packs

    def add_raw_records(self, sizes, raw_data):
        """Add raw knit bytes to a storage area.

        The data is spooled to the container writer in one bytes-record per
        raw data unit.

        :param sizes: An iterable containing the size of each raw data segment.
        :param raw_data: A bytestring containing the data.
        :return: A list of memos to retrieve the record later. Each memo is a
            tuple - (index, pos, length), where the index field is the
            write_index object supplied to the PackAccess object.
        """
        assert type(raw_data) == str, \
            'data must be plain bytes was %s' % type(raw_data)
        result = []
        offset = 0
        for size in sizes:
            p_offset, p_length = self.container_writer.add_bytes_record(
                raw_data[offset:offset+size], [])
            offset += size
            result.append((self.write_index, p_offset, p_length))
        return result

    def create(self):
        """Pack based knits do not get individually created."""

    def get_raw_records(self, memos_for_retrieval):
        """Get the raw bytes for the records.

        :param memos_for_retrieval: An iterable containing the (index, pos,
            length) memo for retrieving the bytes. The Pack access method
            looks up the pack to use for a given record in its index_to_pack
            map.
        :return: An iterator over the bytes of the records.
        """
        # first pass, group into same-index requests
        request_lists = []
        current_list = []
        current_index = None
        for (index, offset, length) in memos_for_retrieval:
            if current_index == index:
                current_list.append((offset, length))
            else:
                if current_index is not None:
                    request_lists.append((current_index, current_list))
                current_index = index
                current_list = [(offset, length)]
        # handle the last entry
        if current_index is not None:
            request_lists.append((current_index, current_list))
        for index, offsets in request_lists:
            transport, path = self.indices[index]
            reader = pack.make_readv_reader(transport, path, offsets)
            for names, read_func in reader.iter_records():
                yield read_func(None)

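    # Sketch of the grouping pass (index objects i1/i2 are hypothetical):
    #
    #   memos = [(i1, 0, 10), (i1, 10, 5), (i2, 0, 7), (i1, 30, 4)]
    #
    # produces request_lists == [(i1, [(0, 10), (10, 5)]), (i2, [(0, 7)]),
    # (i1, [(30, 4)])]. Only *adjacent* memos for the same index are
    # coalesced into one readv, so callers get the best throughput by
    # grouping their requests by index first.
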
    def open_file(self):
        """Pack based knits have no single file."""
        return None

    def set_writer(self, writer, index, (transport, packname)):
        """Set a writer to use for adding data."""
        if index is not None:
            self.indices[index] = (transport, packname)
        self.container_writer = writer
        self.write_index = index


class _KnitData(object):
    """Manage extraction of data from a KnitAccess, caching and decompressing.

    The KnitData class provides the logic for parsing and using knit records,
    making use of an access method for the low level read and write
    operations.
    """

    def __init__(self, access):
        """Create a KnitData object.

        :param access: The access method to use. Access methods such as
            _KnitAccess manage the insertion of raw records and the subsequent
            retrieval of the same.
        """
        self._access = access
        self._checked = False
        # TODO: jam 20060713 conceptually, this could spill to disk
        #       if the cached size gets larger than a certain amount


InterVersionedFile.register_optimiser(WeaveToKnit)


# Deprecated, use PatienceSequenceMatcher instead
KnitSequenceMatcher = patiencediff.PatienceSequenceMatcher


def annotate_knit(knit, revision_id):
    """Annotate a knit with no cached annotations.

    This implementation is for knits with no cached annotations.
    It will work for knits with cached annotations, but this is not
    recommended.
    """
    ancestry = knit.get_ancestry(revision_id)
    fulltext = dict(zip(ancestry, knit.get_line_list(ancestry)))
    annotations = {}
    for candidate in ancestry:
        if candidate in annotations:
            continue
        parents = knit.get_parents(candidate)
        if len(parents) == 0:
            blocks = None
        elif knit._index.get_method(candidate) != 'line-delta':
            blocks = None
        else:
            parent, sha1, noeol, delta = knit.get_delta(candidate)
            blocks = KnitContent.get_line_delta_blocks(delta,
                fulltext[parents[0]], fulltext[candidate])
        annotations[candidate] = list(annotate.reannotate([annotations[p]
            for p in parents], fulltext[candidate], candidate, blocks))
    return iter(annotations[revision_id])


try:
    from bzrlib._knit_load_data_c import _load_data_c as _load_data
except ImportError:
    from bzrlib._knit_load_data_py import _load_data_py as _load_data


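# Sketch of use (knit object and revision id are hypothetical):
# annotate_knit walks the ancestry once, reusing each parent's finished
# annotation via annotate.reannotate, so
#
#   for origin, line in annotate_knit(knit_vf, 'rev-10'):
#       print origin, line
#
# emits one (origin, line) pair per line of 'rev-10', attributing each
# line to the revision that introduced it.

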
class KnitSequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION: stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced
            # the following improvement
            #     704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to
            #     704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400  0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i - k, 1 + j - k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize
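
    # Worked sketch of the j2len recurrence (sequences are hypothetical):
    # with a = 'abcb' and b = 'bcab', after processing a[1] == 'b' we have
    # j2len == {0: 1, 3: 2}. Processing a[2] == 'c' (which occurs at b[1])
    # gives newj2len[1] = 1 + j2len.get(0, 0) == 2, i.e. the match 'bc'
    # ending at (i=2, j=1). Each run is extended from its diagonal
    # neighbour, so the cost per element of a is proportional to its
    # number of occurrences in b rather than to len(b).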