# Copyright (C) 2005, 2006 by Canonical Ltd
# Written by Martin Pool.
# Modified by Johan Rydberg <jrydberg@gnu.org>
# Modified by Robert Collins <robert.collins@canonical.com>
# Modified by Aaron Bentley <aaron.bentley@utoronto.ca>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only
writes.

lifeless: the data file is made up of "delta records".  Each delta record has
a delta header that contains; (1) a version id, (2) the size of the delta
(in lines), and (3) the digest of the -expanded data- (ie, the delta applied
to the parent).  The delta also ends with an end-marker; simply "end VERSION".

Deltas can be line deltas or full contents.
... the 8's there are the index number of the annotation.
version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e
8         e.set('executable', 'yes')
8         if elt.get('executable') == 'yes':
8             ie.executable = True
end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents
09:33 < jrydberg> lifeless: the parents are currently dictionary compressed
09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)
09:33 < lifeless> right
09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used
in the deltas to provide line annotation
"""
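
# An illustrative sketch (not part of this module's API): parsing the
# annotated record layout quoted above.  The header names the version id,
# the size of the delta in lines, and the sha1 of the expanded text; each
# content line is prefixed by the index number of its annotation origin;
# "end VERSION" closes the record.
def _example_parse_record(record_lines):
    # "version <id> <size-in-lines> <sha1-of-expanded-text>"
    header = record_lines[0].split()
    assert header[0] == 'version'
    version_id, size, sha1 = header[1], int(header[2]), header[3]
    # the body follows; its lines carry annotation index prefixes
    body = record_lines[1:-1]
    # "end <id>" is the end-marker
    assert record_lines[-1].split() == ['end', version_id]
    return version_id, size, sha1, body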

# TODO:
# 10:16 < lifeless> make partial index writes safe
# 10:16 < lifeless> implement 'knit.check()' like weave.check()
# 10:17 < lifeless> record known ghosts so we can detect when they are filled
#                   in rather than the current 'reweave'
# move sha1 out of the content so that join is faster at verifying parents
# record content length ?

from cStringIO import StringIO
import difflib
from itertools import izip, chain
import operator

import bzrlib
import bzrlib.errors as errors
from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
        InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
        RevisionNotPresent, RevisionAlreadyPresent
from bzrlib.tuned_gzip import GzipFile
from bzrlib.trace import mutter
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
        sha_strings
from bzrlib.versionedfile import VersionedFile, InterVersionedFile
from bzrlib.tsort import topo_sort
import bzrlib.ui
import bzrlib.weave


# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data
# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor
# position after writing to work out where it was located.  We may need to
# bypass python file buffering.

DATA_SUFFIX = '.knit'
INDEX_SUFFIX = '.kndx'


class KnitContent(object):
    """Content of a knit version to which deltas can be applied."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        for origin, text in self._lines:
            yield origin, text

    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = [text for origin, text in new_lines._lines]
        old_texts = [text for origin, text in self._lines]
        s = KnitSequenceMatcher(None, old_texts, new_texts)
        for op in s.get_opcodes():
            if op[0] == 'equal':
                continue
            #     ofrom oto length data
            yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))

    def text(self):
        return [text for origin, text in self._lines]
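

# An illustrative sketch, not used by the implementation: the hunk format
# produced by KnitContent.line_delta().  Each hunk is (start, end, count,
# lines), where [start:end) is the replaced range in the old content and
# `lines` are the new annotated lines; equal regions produce no hunk.
def _demo_line_delta():
    old = KnitContent([('rev-1', 'a\n'), ('rev-1', 'b\n')])
    new = KnitContent([('rev-2', 'a\n'), ('rev-2', 'c\n')])
    return old.line_delta(new)   # [(1, 2, 1, [('rev-2', 'c\n')])]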


class _KnitFactory(object):
    """Base factory for creating content objects."""

    def make(self, lines, version):
        num_lines = len(lines)
        return KnitContent(zip([version] * num_lines, lines))


class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def parse_fulltext(self, content, version):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\n
        internal representation is of the format:
        (revid, plaintext)
        """
        lines = []
        for line in content:
            origin, text = line.split(' ', 1)
            lines.append((origin.decode('utf-8'), text))
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines, version):
        for result_item in self.parse_line_delta(lines, version):
            yield result_item

    def parse_line_delta(self, lines, version):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        1..count lines:
        revid(utf8) newline\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])
        """
        result = []
        lines = iter(lines)
        next = lines.next
        # walk through the lines parsing.
        for header in lines:
            start, end, count = [int(n) for n in header.split(',')]
            contents = []
            for i in xrange(count):
                origin, text = next().split(' ', 1)
                contents.append((origin.decode('utf-8'), text))
            result.append((start, end, count, contents))
        return result

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        return ['%s %s' % (o.encode('utf-8'), t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            for origin, text in lines:
                out.append('%s %s' % (origin.encode('utf-8'), text))
        return out
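

# An illustrative sketch, not used by the implementation: the annotated
# fulltext serialization round-trips through lower_fulltext() and
# parse_fulltext().  On disk each line is "<utf-8 revid> <text>\n";
# internally it is a list of (revid, text) tuples.
def _demo_annotated_roundtrip():
    factory = KnitAnnotateFactory()
    content = KnitContent([(u'rev-1', 'hello\n'), (u'rev-2', 'world\n')])
    stored = factory.lower_fulltext(content)
    # stored == ['rev-1 hello\n', 'rev-2 world\n']
    return factory.parse_fulltext(stored, 'rev-2')._lines == content._lines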


class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects."""

    annotated = False

    def parse_fulltext(self, content, version):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - it's just a constant versionid.
        """
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        while lines:
            header = lines.pop(0)
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version] * c, lines[:c])
            lines = lines[c:]

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines, version))

    def lower_fulltext(self, content):
        return content.text()

    def lower_line_delta(self, delta):
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend([text for origin, text in lines])
        return out
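

# An illustrative sketch, not used by the implementation: the plain
# (unannotated) line-delta serialization.  A serialized delta is a flat
# list of "start,end,count\n" headers, each followed by `count` raw lines;
# parsing re-attaches a constant version id to every line.
def _demo_plain_delta_roundtrip():
    factory = KnitPlainFactory()
    delta = [(1, 2, 1, [('rev-2', 'c\n')])]
    stored = factory.lower_line_delta(delta)   # ['1,2,1\n', 'c\n']
    return factory.parse_line_delta(stored, 'rev-2') == delta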


def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location."""
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory(), create=True)


class KnitVersionedFile(VersionedFile):
    """Weave-like structure with faster random access.

    A knit stores a number of texts and a summary of the relationships
    between them.  Texts are identified by a string version-id.  Texts
    are normally stored and retrieved as a series of lines, but can
    also be passed as single strings.

    Lines are stored with the trailing newline (if any) included, to
    avoid special cases for files with no final newline.  Lines are
    composed of 8-bit characters, not unicode.  The combination of
    these approaches should mean any 'binary' file can be safely
    stored and retrieved.
    """
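
    # Example usage (a sketch; `transport` is assumed to be a writable
    # bzrlib Transport, e.g. from bzrlib.transport.get_transport('.')):
    #
    #   knit = KnitVersionedFile('example', transport, create=True)
    #   knit.add_lines('rev-1', [], ['hello\n'])
    #   knit.add_lines('rev-2', ['rev-1'], ['hello\n', 'world\n'])
    #   knit.get_lines('rev-2')      # -> ['hello\n', 'world\n']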

    def __init__(self, relpath, transport, file_mode=None, access_mode=None,
                 factory=None, basis_knit=None, delta=True, create=False):
        """Construct a knit at location specified by relpath.

        :param create: If not True, only open an existing knit.
        """
        if access_mode is None:
            access_mode = 'w'
        super(KnitVersionedFile, self).__init__(access_mode)
        assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
        assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
            "basis knit must be a KnitVersionedFile, got %r" % (basis_knit,)

        self.transport = transport
        self.filename = relpath
        self.basis_knit = basis_knit
        self.factory = factory or KnitAnnotateFactory()
        self.writable = (access_mode == 'w')
        self.delta = delta

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
            access_mode, create=create, file_mode=file_mode)
        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
            access_mode, create=create and not len(self), file_mode=file_mode)

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                           self.transport.abspath(self.filename))

    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, []) # should we check the lines ?
        self._check_versions_present(parents)
        present_parents = []
        ghosts = []
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta_parent is None:
            # reconstitute as full text.
            assert len(delta) == 1 or len(delta) == 0
            if len(delta):
                assert delta[0][0] == 0
                assert delta[0][1] == 0, delta[0][1]
            return super(KnitVersionedFile, self)._add_delta(version_id,
                                                             parents,
                                                             delta_parent,
                                                             sha1,
                                                             noeol,
                                                             delta)

        digest = sha1

        options = []
        if noeol:
            options.append('no-eol')

        if delta_parent is not None:
            # determine the current delta chain length.
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spent applying deltas.
            count = 0
            delta_parents = [delta_parent]
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # did not find a fulltext in the delta limit.
                # just do a normal insertion.
                return super(KnitVersionedFile, self)._add_delta(version_id,
                                                                 parents,
                                                                 delta_parent,
                                                                 sha1,
                                                                 noeol,
                                                                 delta)

        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)

    def _add_raw_records(self, records, data):
        """Add all the records 'records' with data pre-joined in 'data'.

        :param records: A list of tuples(version_id, options, parents, size).
        :param data: The data for the records. When it is written, the records
                     are adjusted to have pos pointing into data by the sum of
                     the preceding records sizes.
        """
        pos = self._data.add_raw_record(data)
        index_entries = []
        for (version_id, options, parents, size) in records:
            index_entries.append((version_id, options, pos, size, parents))
            pos += size
        self._index.add_versions(index_entries)

    def clear_cache(self):
        """Clear the data cache only."""
        self._data.clear_cache()

    def copy_to(self, name, transport):
        """See VersionedFile.copy_to()."""
        # copy the current index to a temp index to avoid racing with local
        # writes:
        transport.put(name + INDEX_SUFFIX + '.tmp', self.transport.get(self._index._filename),)
        # copy the data file
        transport.put(name + DATA_SUFFIX, self._data._open_file())
        # rename the copied index into place
        transport.rename(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)

    def create_empty(self, name, transport, mode=None):
        return KnitVersionedFile(name, transport, factory=self.factory, delta=self.delta, create=True)

    def _fix_parents(self, version, new_parents):
        """Fix the parents list for version.

        This is done by appending a new version to the index
        with identical data except for the parents list.
        The new parents list must be a superset of the current
        version's parents.
        """
        current_values = self._index._cache[version]
        assert set(current_values[4]).difference(set(new_parents)) == set()
        self._index.add_version(version,
                                current_values[1],
                                current_values[2],
                                current_values[3],
                                new_parents)

    def get_delta(self, version_id):
        """Get a delta for constructing version from some other version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        parents = self.get_parents(version_id)
        if len(parents):
            parent = parents[0]
        else:
            parent = None
        data_pos, data_size = self._index.get_position(version_id)
        data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
        version_idx = self._index.lookup(version_id)
        noeol = 'no-eol' in self._index.get_options(version_id)
        if 'fulltext' == self._index.get_method(version_id):
            new_content = self.factory.parse_fulltext(data, version_idx)
            if parent is not None:
                reference_content = self._get_content(parent)
                old_texts = reference_content.text()
            else:
                old_texts = []
            new_texts = new_content.text()
            delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
            return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
        else:
            delta = self.factory.parse_line_delta(data, version_idx)
            return parent, sha1, noeol, delta

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    def get_sha1(self, version_id):
        """See VersionedFile.get_sha1()."""
        components = self._get_components(version_id)
        return components[-1][-1][-1]

    @staticmethod
    def get_suffixes():
        """See VersionedFile.get_suffixes()."""
        return [DATA_SUFFIX, INDEX_SUFFIX]

    def has_ghost(self, version_id):
        """True if there is a ghost reference in the file to version_id."""
        # maybe we have it
        if self.has_version(version_id):
            return False
        # optimisable if needed by memoising the _ghosts set.
        items = self._index.get_graph()
        for node, parents in items:
            for parent in parents:
                if parent not in self._index._cache:
                    if parent == version_id:
                        return True
        return False

    def versions(self):
        """See VersionedFile.versions."""
        return self._index.get_versions()

    def has_version(self, version_id):
        """See VersionedFile.has_version."""
        return self._index.has_version(version_id)

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text.
        """
        if annotated:
            delta_seq = None
            for parent_id in parents:
                merge_content = self._get_content(parent_id, parent_texts)
                seq = KnitSequenceMatcher(None, merge_content.text(), content.text())
                if delta_seq is None:
                    # setup a delta seq to reuse.
                    delta_seq = seq
                for i, j, n in seq.get_matching_blocks():
                    if n == 0:
                        continue
                    # this appears to copy (origin, text) pairs across to the new
                    # content for any line that matches the last-checked parent.
                    # FIXME: save the sequence control data for delta compression
                    # against the most relevant parent rather than rediffing.
                    content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            if not annotated:
                reference_content = self._get_content(parents[0], parent_texts)
                new_texts = content.text()
                old_texts = reference_content.text()
                delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
            return self._make_line_delta(delta_seq, content)

    def _make_line_delta(self, delta_seq, new_content):
        """Generate a line delta from delta_seq and new_content."""
        diff_hunks = []
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
        return diff_hunks

    def _get_component_versions(self, version_id):
        basis = self.basis_knit
        needed_versions = []
        basis_versions = []
        cursor = version_id
        while 1:
            picked_knit = self
            if basis and basis._index.has_version(cursor):
                picked_knit = basis
                basis_versions.append(cursor)
            method = picked_knit._index.get_method(cursor)
            needed_versions.append((method, cursor))
            if method == 'fulltext':
                break
            cursor = picked_knit.get_parents(cursor)[0]
        return needed_versions, basis_versions

    def _get_component_positions(self, version_id):
        needed_versions, basis_versions = \
            self._get_component_versions(version_id)
        assert len(basis_versions) == 0
        positions = []
        for method, comp_id in needed_versions:
            data_pos, data_size = self._index.get_position(comp_id)
            positions.append((method, comp_id, data_pos, data_size))
        return positions

    def _get_components(self, version_id):
        """Return a list of (version_id, method, data) tuples that
        makes up version specified by version_id of the knit.

        The components should be applied in the order of the returned
        list.

        The basis knit will be used to the largest extent possible
        since it is assumed that accesses to it are faster.
        """
        # 4168 calls in 14912, 2289 internal
        # 4168 in 9711 to read_records
        # 52554 in 1250 to get_parents
        # 170166 in 865 to list.append

        # needed_revisions holds a list of (method, version_id) of
        # versions that need to be fetched to construct the final
        # version of the file.
        #
        # basis_revisions is a list of versions that need to be
        # fetched but exist in the basis knit.

        needed_versions, basis_versions = \
            self._get_component_versions(version_id)

        components = {}
        if basis_versions:
            assert False, "I am broken"
            basis = self.basis_knit
            records = []
            for comp_id in basis_versions:
                data_pos, data_size = basis._index.get_data_position(comp_id)
                records.append((comp_id, data_pos, data_size))
            components.update(basis._data.read_records(records))

        records = []
        for comp_id in [vid for method, vid in needed_versions
                        if vid not in basis_versions]:
            data_pos, data_size = self._index.get_position(comp_id)
            records.append((comp_id, data_pos, data_size))
        components.update(self._data.read_records(records))

        # get_data_records returns a mapping with the version id as
        # index and the value as data.  The order the components need
        # to be applied is held by needed_versions (reversed).
        out = []
        for method, comp_id in reversed(needed_versions):
            out.append((comp_id, method, components[comp_id]))

        return out

    def _get_content(self, version_id, parent_texts={}):
        """Returns a content object that makes up the specified
        version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        cached_version = parent_texts.get(version_id, None)
        if cached_version is not None:
            return cached_version

        if self.basis_knit and version_id in self.basis_knit:
            return self.basis_knit._get_content(version_id)

        content = None
        components = self._get_components(version_id)
        for component_id, method, (data, digest) in components:
            version_idx = self._index.lookup(component_id)
            if method == 'fulltext':
                assert content is None
                content = self.factory.parse_fulltext(data, version_idx)
            elif method == 'line-delta':
                delta = self.factory.parse_line_delta(data, version_idx)
                content._lines = self._apply_delta(content._lines, delta)

        if 'no-eol' in self._index.get_options(version_id):
            line = content._lines[-1][1].rstrip('\n')
            content._lines[-1] = (content._lines[-1][0], line)

        # digest here is the digest from the last applied component.
        if sha_strings(content.text()) != digest:
            raise KnitCorrupt(self.filename, 'sha-1 does not match %s' % version_id)

        return content
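
    @staticmethod
    def _demo_no_eol(lines):
        # An illustrative sketch, not used by the implementation, of the
        # no-eol convention applied in _get_content() above and _add()
        # below: a text without a final newline is stored with one appended
        # and the 'no-eol' option recorded; retrieval strips it again.
        options = []
        if lines and lines[-1][-1] != '\n':
            options.append('no-eol')
            lines = lines[:-1] + [lines[-1] + '\n']        # normalized for storage
        if 'no-eol' in options:
            lines = lines[:-1] + [lines[-1].rstrip('\n')]  # restored on read
        return options, lines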

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for r in list(version_ids):
            if self._index.has_version(r):
                version_ids.remove(r)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self.filename)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _add_lines(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _check_add(self, version_id, lines):
        """check that version_id and lines are safe to add."""
        assert self.writable, "knit is not opened for write"
        ### FIXME escape. RBC 20060228
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id, self.filename)
        if self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)
        self._check_lines_not_unicode(lines)
        self._check_lines_are_lines(lines)

    def _add(self, version_id, lines, parents, delta, parent_texts):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first present parent.

        Any versions not present will be converted into ghosts.
        """
        #  461    0   6546.0390     43.9100   bzrlib.knit:489(_add)
        # +400    0    889.4890    418.9790   +bzrlib.knit:192(lower_fulltext)
        # +461    0   1364.8070    108.8030   +bzrlib.knit:996(add_record)
        # +461    0    193.3940     41.5720   +bzrlib.knit:898(add_version)
        # +461    0    134.0590     18.3810   +bzrlib.osutils:361(sha_strings)
        # +461    0     36.3420     15.4540   +bzrlib.knit:146(make)
        # +1383   0      8.0370      8.0370   +<len>
        # +61     0     13.5770      7.9190   +bzrlib.knit:199(lower_line_delta)
        # +61     0    963.3470      7.8740   +bzrlib.knit:427(_get_content)
        # +61     0    973.9950      5.2950   +bzrlib.knit:136(line_delta)
        # +61     0   1918.1800      5.2640   +bzrlib.knit:359(_merge_annotations)

        present_parents = []
        ghosts = []
        if parent_texts is None:
            parent_texts = {}
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta and not len(present_parents):
            delta = False

        digest = sha_strings(lines)
        options = []
        if lines[-1][-1] != '\n':
            options.append('no-eol')
            lines[-1] = lines[-1] + '\n'

        if len(present_parents) and delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spent applying deltas.
            count = 0
            delta_parents = present_parents
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                delta = False

        lines = self.factory.make(lines, version_id)
        if delta or (self.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if needed.
            delta_hunks = self._merge_annotations(lines, present_parents, parent_texts,
                                                  delta, self.factory.annotated)

        if delta:
            options.append('line-delta')
            store_lines = self.factory.lower_line_delta(delta_hunks)
        else:
            options.append('fulltext')
            store_lines = self.factory.lower_fulltext(lines)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)
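
    @staticmethod
    def _demo_delta_chain_bound(index, delta_parent, limit=25):
        # An illustrative sketch, not used by the implementation: the loop
        # in _add() above bounds the delta chain by walking from the
        # candidate delta parent towards a fulltext; if no fulltext is
        # found within `limit` (25 in the reconstruction above), a fulltext
        # is stored instead of a delta.
        count = 0
        parent = delta_parent
        while count < limit:
            if index.get_method(parent) == 'fulltext':
                return False        # chain is anchored; a line-delta is fine
            parent = index.get_parents(parent)[0]
            count = count + 1
        return True                 # too long; caller should store a fulltext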

    def check(self, progress_bar=None):
        """See VersionedFile.check()."""

    def _clone_text(self, new_version_id, old_version_id, parents):
        """See VersionedFile.clone_text()."""
        # FIXME RBC 20060228 make fast by only inserting an index with null
        # delta content.
        self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

    def get_lines(self, version_id):
        """See VersionedFile.get_lines()."""
        return self.get_line_list([version_id])[0]

    def _get_version_components(self, position_map):
        records = []
        for version_id, positions in position_map.iteritems():
            for method, comp_id, position, size in positions:
                records.append((comp_id, position, size))
        record_map = self._data.read_records(records)

        component_map = {}
        for version_id, positions in position_map.iteritems():
            components = []
            for method, comp_id, position, size in positions:
                data, digest = record_map[comp_id]
                components.append((comp_id, method, data, digest))
            component_map[version_id] = components
        return component_map

    def get_text(self, version_id):
        """See VersionedFile.get_text"""
        return self.get_texts([version_id])[0]

    def get_texts(self, version_ids):
        return [''.join(l) for l in self.get_line_list(version_ids)]

    def get_line_list(self, version_ids):
        """Return the texts of listed versions as a list of strings."""
        position_map = {}
        for version_id in version_ids:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
            position_map[version_id] = \
                self._get_component_positions(version_id)

        version_components = self._get_version_components(position_map).items()

        text_map = {}
        for version_id, components in version_components:
            content = None
            for component_id, method, data, digest in reversed(components):
                version_idx = self._index.lookup(component_id)
                if method == 'fulltext':
                    assert content is None
                    content = self.factory.parse_fulltext(data, version_idx)
                elif method == 'line-delta':
                    delta = self.factory.parse_line_delta(data, version_idx)
                    content._lines = self._apply_delta(content._lines, delta)

            if 'no-eol' in self._index.get_options(version_id):
                line = content._lines[-1][1].rstrip('\n')
                content._lines[-1] = (content._lines[-1][0], line)

            # digest here is the digest from the last applied component.
            if sha_strings(content.text()) != digest:
                raise KnitCorrupt(self.filename,
                                  'sha-1 does not match %s' % version_id)

            text_map[version_id] = content.text()
        return [text_map[v] for v in version_ids]

    def iter_lines_added_or_present_in_versions(self, version_ids=None):
        """See VersionedFile.iter_lines_added_or_present_in_versions()."""
        if version_ids is None:
            version_ids = self.versions()
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need version_id, position, length
        version_id_records = []
        requested_versions = list(version_ids)
        # filter for available versions
        for version_id in requested_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # get an in-component-order queue:
        version_ids = []
        for version_id in self.versions():
            if version_id in requested_versions:
                version_ids.append(version_id)
                data_pos, length = self._index.get_position(version_id)
                version_id_records.append((version_id, data_pos, length))

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            count = 0
            total = len(version_id_records)
            pb.update('Walking content.', count, total)
            for version_id, data, sha_value in \
                self._data.read_records_iter(version_id_records):
                pb.update('Walking content.', count, total)
                count = count + 1
                method = self._index.get_method(version_id)
                version_idx = self._index.lookup(version_id)
                assert method in ('fulltext', 'line-delta')
                if method == 'fulltext':
                    content = self.factory.parse_fulltext(data, version_idx)
                    for line in content.text():
                        yield line
                else:
                    delta = self.factory.parse_line_delta(data, version_idx)
                    for start, end, count, lines in delta:
                        for origin, line in lines:
                            yield line
            pb.update('Walking content.', total, total)
            pb.finished()
        except:
            pb.update('Walking content.', total, total)
            pb.finished()
            raise

    def num_versions(self):
        """See VersionedFile.num_versions()."""
        return self._index.num_versions()

    __len__ = num_versions

    def annotate_iter(self, version_id):
        """See VersionedFile.annotate_iter."""
        content = self._get_content(version_id)
        for origin, text in content.annotate_iter():
            yield origin, text

    def get_parents(self, version_id):
        """See VersionedFile.get_parents."""
        # 52554 calls in 1264 872 internal down from 3674
        try:
            return self._index.get_parents(version_id)
        except KeyError:
            raise RevisionNotPresent(version_id, self.filename)

    def get_parents_with_ghosts(self, version_id):
        """See VersionedFile.get_parents."""
        try:
            return self._index.get_parents_with_ghosts(version_id)
        except KeyError:
            raise RevisionNotPresent(version_id, self.filename)

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry(versions)

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry_with_ghosts(versions)

    #@deprecated_method(zero_eight)
    def walk(self, version_ids):
        """See VersionedFile.walk."""
        # We take the short path here, and extract all relevant texts
        # and put them in a weave and let that do all the work.  Far
        # from optimal, but is much simpler.
        # FIXME RB 20060228 this really is inefficient!
        from bzrlib.weave import Weave

        w = Weave(self.filename)
        ancestry = self.get_ancestry(version_ids)
        sorted_graph = topo_sort(self._index.get_graph())
        version_list = [vid for vid in sorted_graph if vid in ancestry]

        for version_id in version_list:
            lines = self.get_lines(version_id)
            w.add_lines(version_id, self.get_parents(version_id), lines)

        for lineno, insert_id, dset, line in w.walk(version_ids):
            yield lineno, insert_id, dset, line

    def plan_merge(self, ver_a, ver_b):
        """See VersionedFile.plan_merge."""
        ancestors_b = set(self.get_ancestry(ver_b))
        def status_a(revision, text):
            if revision in ancestors_b:
                return 'killed-b', text
            else:
                return 'new-a', text

        ancestors_a = set(self.get_ancestry(ver_a))
        def status_b(revision, text):
            if revision in ancestors_a:
                return 'killed-a', text
            else:
                return 'new-b', text

        annotated_a = self.annotate(ver_a)
        annotated_b = self.annotate(ver_b)
        plain_a = [t for (a, t) in annotated_a]
        plain_b = [t for (a, t) in annotated_b]
        blocks = KnitSequenceMatcher(None, plain_a, plain_b).get_matching_blocks()
        a_cur = 0
        b_cur = 0
        for ai, bi, l in blocks:
            # process all mismatched sections
            # (last mismatched section is handled because blocks always
            # includes a 0-length last block)
            for revision, text in annotated_a[a_cur:ai]:
                yield status_a(revision, text)
            for revision, text in annotated_b[b_cur:bi]:
                yield status_b(revision, text)

            # and now the matched section
            a_cur = ai + l
            b_cur = bi + l
            for text_a, text_b in zip(plain_a[ai:a_cur], plain_b[bi:b_cur]):
                assert text_a == text_b
                yield "unchanged", text_a


class _KnitComponentFile(object):
    """One of the files used to implement a knit database."""

    def __init__(self, transport, filename, mode, file_mode=None):
        self._transport = transport
        self._filename = filename
        self._mode = mode
        self._file_mode = file_mode

    def write_header(self):
        if self._transport.append(self._filename, StringIO(self.HEADER),
            mode=self._file_mode):
            raise KnitCorrupt(self._filename, 'misaligned after writing header')

    def check_header(self, fp):
        line = fp.readline()
        if line != self.HEADER:
            raise KnitHeaderError(badline=line)

    def commit(self):
        """Commit is a nop."""

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self._filename)


class _KnitIndex(_KnitComponentFile):
    """Manages knit index file.

    The index is already kept in memory and read on startup, to enable
    fast lookups of revision information.  The cursor of the index
    file is always pointing to the end, making it easy to append
    entries.

    _cache is a cache for fast mapping from version id to an index entry.

    _history is a cache for fast mapping from indexes to version ids.

    The index data format is dictionary compressed when it comes to
    parent references; an index entry may only have parents with a
    lower index number.  As a result, the index is topologically sorted.

    Duplicate entries may be written to the index for a single version id
    if this is done then the latter one completely replaces the former:
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.

    The index file on disc contains a header, followed by one line per knit
    record. The same revision can be present in an index file more than once.
    The first occurrence gets assigned a sequence number starting from 0.

    The format of a single line is
    REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n
    REVISION_ID is a utf8-encoded revision id
    FLAGS is a comma separated list of flags about the record. Values include
        no-eol, line-delta, fulltext.
    BYTE_OFFSET is the ascii representation of the byte offset in the data file
        that the compressed data starts at.
    LENGTH is the ascii representation of the length of the record in the
        data file.
    PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of
        REVISION_ID.
    PARENT_SEQUENCE_ID the ascii representation of the sequence number of a
        revision id already in the knit that is a parent of REVISION_ID.
    The ' :' marker is the end of record marker.

    partial writes:
    When a write to the index file is interrupted, it will result in a line
    that does not end in ' :'. If the ' :' is not present at the end of a line,
    or at the end of the file, then the record that is missing it will be
    ignored by the parser.

    When writing new records to the index file, the data is preceded by '\n'
    to ensure that records always start on new lines even if the last write was
    interrupted. As a result it's normal for the last line in the index to be
    missing a trailing newline. One can be added with no harmful effects.
    """
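
    # A small sketch of what an index file might contain (not real data):
    #
    #   # bzr knit index 8
    #   rev-1 fulltext 0 81  :
    #   rev-2 line-delta 81 83 0 :
    #   rev-3 line-delta 164 83 1 .ghost-rev :
    #
    # Here the '0' and '1' parent fields are dictionary-compressed
    # references to rev-1 and rev-2 (their sequence numbers in _history),
    # while '.ghost-rev' is an uncompressed, possibly-ghost parent id.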

    HEADER = "# bzr knit index 8\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
         indexes).
        """
        # only want the _history index to reference the 1st
        # index entry for version_id
        if version_id not in self._cache:
            index = len(self._history)
            self._history.append(version_id)
        else:
            index = self._cache[version_id][5]
        self._cache[version_id] = (version_id,
                                   options,
                                   pos,
                                   size,
                                   parents,
                                   index)

    def __init__(self, transport, filename, mode, create=False, file_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
        self._cache = {}
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        count = 0
        total = 1
        try:
            try:
                pb.update('read knit index', count, total)
                fp = self._transport.get(self._filename)
                self.check_header(fp)
                # readlines reads the whole file at once:
                # bad for transports like http, good for local disk
                # we save 60 ms doing this one change (
                # from calling readline each time to calling
                # readlines once.
                # probably what we want for nice behaviour on
                # http is a incremental readlines that yields, or
                # a check for local vs non local indexes,
                for l in fp.readlines():
                    rec = l.split()
                    if len(rec) < 5 or rec[-1] != ':':
                        # corrupt line.
                        # FIXME: in the future we should determine if its a
                        # short write - and ignore it
                        # or a different failure, and raise. RBC 20060407
                        continue
                    count += 1
                    total += 1
                    #pb.update('read knit index', count, total)
                    # See self._parse_parents
                    parents = []
                    for value in rec[4:-1]:
                        if '.' == value[0]:
                            # uncompressed reference
                            parents.append(value[1:])
                        else:
                            # this is 15/4000ms faster than isinstance,
                            # this function is called thousands of times a
                            # second so small variations add up.
                            assert value.__class__ is str
                            parents.append(self._history[int(value)])
                    # end self._parse_parents
                    # self._cache_version(rec[0],
                    #                     rec[1].split(','),
                    #                     int(rec[2]),
                    #                     int(rec[3]),
                    #                     parents)
                    # --- self._cache_version
                    # only want the _history index to reference the 1st
                    # index entry for version_id
                    version_id = rec[0]
                    if version_id not in self._cache:
                        index = len(self._history)
                        self._history.append(version_id)
                    else:
                        index = self._cache[version_id][5]
                    self._cache[version_id] = (version_id,
                                               rec[1].split(','),
                                               int(rec[2]),
                                               int(rec[3]),
                                               parents,
                                               index)
                    # --- self._cache_version
            except NoSuchFile, e:
                if mode != 'w' or not create:
                    raise
                self.write_header()
        finally:
            pb.update('read knit index', total, total)
            pb.finished()

    def _parse_parents(self, compressed_parents):
        """convert a list of string parent values into version ids.

        ints are looked up in the index.
        .FOO values are ghosts and converted into FOO.

        NOTE: the function is retained here for clarity, and for possible
              use in partial index reads. However bulk processing now has
              it inlined in __init__ for inner-loop optimisation.
        """
        result = []
        for value in compressed_parents:
            if value[0] == '.':
                # uncompressed reference
                result.append(value[1:])
            else:
                # this is 15/4000ms faster than isinstance,
                # this function is called thousands of times a
                # second so small variations add up.
                assert value.__class__ is str
                result.append(self._history[int(value)])
        return result

    def get_graph(self):
        graph = []
        for version_id, index in self._cache.iteritems():
            graph.append((version_id, index[4]))
        return graph

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            parents = self._cache[version][4]
            # got the parents ok
            # trim ghosts
            parents = [parent for parent in parents if parent in self._cache]
            for parent in parents:
                # if not completed and not a ghost
                if parent not in graph:
                    pending.add(parent)
            graph[version] = parents
        return topo_sort(graph.items())
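
    @staticmethod
    def _demo_ancestry(parent_map, versions):
        # An illustrative sketch, not used by the implementation: the
        # closure walk performed by get_ancestry() above, over a plain dict
        # of version id -> parent ids instead of _cache, ghosts trimmed,
        # and the result sorted with bzrlib.tsort.topo_sort.
        # e.g. _KnitIndex._demo_ancestry({'a': [], 'b': ['a']}, ['b'])
        # returns ['a', 'b'].
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            parents = [p for p in parent_map[version] if p in parent_map]
            for parent in parents:
                if parent not in graph:
                    pending.add(parent)
            graph[version] = parents
        return topo_sort(graph.items())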

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            try:
                parents = self._cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
            else:
                # got the parents ok
                for parent in parents:
                    if parent not in graph:
                        pending.add(parent)
                graph[version] = parents
        return topo_sort(graph.items())

    def num_versions(self):
        return len(self._history)

    __len__ = num_versions

    def get_versions(self):
        return self._history

    def idx_to_name(self, idx):
        return self._history[idx]

    def lookup(self, version_id):
        assert version_id in self._cache
        return self._cache[version_id][5]

    def _version_list_to_index(self, versions):
        result_list = []
        for version in versions:
            if version in self._cache:
                # -- inlined lookup() --
                result_list.append(str(self._cache[version][5]))
                # -- end lookup () --
            else:
                result_list.append('.' + version.encode('utf-8'))
        return ' '.join(result_list)
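
    @staticmethod
    def _demo_parents_roundtrip():
        # An illustrative sketch, not used by the implementation: how a
        # parents list is dictionary compressed by _version_list_to_index()
        # and recovered by _parse_parents().  'rev-1' is already in the
        # index at sequence number 0; 'ghost-1' is not, so it is written
        # verbatim with a '.' prefix.
        history = ['rev-1']
        serialized = '0 .ghost-1'
        parents = []
        for value in serialized.split():
            if value[0] == '.':
                parents.append(value[1:])            # uncompressed (ghost)
            else:
                parents.append(history[int(value)])  # compressed reference
        return parents      # ['rev-1', 'ghost-1']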

    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        self.add_versions(((version_id, options, pos, size, parents),))

    def add_versions(self, versions):
        """Add multiple versions to the index.

        :param versions: a list of tuples:
                         (version_id, options, pos, size, parents).
        """
        lines = []
        for version_id, options, pos, size, parents in versions:
            line = "\n%s %s %s %s %s :" % (version_id.encode('utf-8'),
                                           ','.join(options),
                                           pos,
                                           size,
                                           self._version_list_to_index(parents))
            assert isinstance(line, str), \
                'content must be utf-8 encoded: %r' % (line,)
            lines.append(line)
        self._transport.append(self._filename, StringIO(''.join(lines)))
        # cache after writing, so that a failed write leads to missing cache
        # entries not extra ones. XXX TODO: RBC 20060502 in the event of a
        # failure, reload the index or flush it or some such, to prevent
        # writing records that did complete twice.
        for version_id, options, pos, size, parents in versions:
            self._cache_version(version_id, options, pos, size, parents)

    def has_version(self, version_id):
        """True if the version is in the index."""
        return self._cache.has_key(version_id)

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        return (self._cache[version_id][2],
                self._cache[version_id][3])

    def get_method(self, version_id):
        """Return compression method of specified version."""
        options = self._cache[version_id][1]
        if 'fulltext' in options:
            return 'fulltext'
        else:
            assert 'line-delta' in options
            return 'line-delta'

    def get_options(self, version_id):
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        return [parent for parent in self._cache[version_id][4]
                if parent in self._cache]

    def get_parents_with_ghosts(self, version_id):
        """Return parents of specified version with ghosts."""
        return self._cache[version_id][4]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for version_id in list(version_ids):
            if version_id in self._cache:
                version_ids.remove(version_id)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self._filename)


class _KnitData(_KnitComponentFile):
    """Contents of the knit data file"""

    HEADER = "# bzr knit data 8\n"

    def __init__(self, transport, filename, mode, create=False, file_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
        self._file = None
        self._checked = False
        self._records = {}
        if create:
            self._transport.put(self._filename, StringIO(''), mode=file_mode)

    def clear_cache(self):
        """Clear the record cache."""
        self._records = {}

    def _open_file(self):
        if self._file is None:
            try:
                self._file = self._transport.get(self._filename)
            except NoSuchFile:
                pass
        return self._file

    def _record_to_data(self, version_id, digest, lines):
        """Convert version_id, digest, lines into a raw data block.

        :return: (len, a StringIO instance with the raw data ready to read.)
        """
        sio = StringIO()
        data_file = GzipFile(None, mode='wb', fileobj=sio)
        data_file.writelines(chain(
            ["version %s %d %s\n" % (version_id.encode('utf-8'),
                                     len(lines),
                                     digest)],
            lines,
            ["end %s\n" % version_id.encode('utf-8')]))
        data_file.close()
        length = sio.tell()
        sio.seek(0)
        return length, sio

    def add_raw_record(self, raw_data):
        """Append a prepared record to the data file.

        :return: the offset in the data file raw_data was written.
        """
        assert isinstance(raw_data, str), 'data must be plain bytes'
        return self._transport.append(self._filename, StringIO(raw_data))

    def add_record(self, version_id, digest, lines):
        """Write new text record to disk.  Returns the position in the
        file where it was written."""
        size, sio = self._record_to_data(version_id, digest, lines)
        self._records[version_id] = (digest, lines)
        start_pos = self._transport.append(self._filename, sio)
        return start_pos, size
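
    @staticmethod
    def _demo_record_roundtrip():
        # An illustrative sketch, not used by the implementation: a data
        # record is one gzip member wrapping "version <id> <count> <digest>",
        # the serialized content lines, then "end <id>".  Plain gzip stands
        # in for the tuned GzipFile used above; the digest is a placeholder.
        import gzip
        sio = StringIO()
        df = gzip.GzipFile(None, mode='wb', fileobj=sio)
        df.writelines(['version rev-1 1 %s\n' % ('0' * 40,),
                       'rev-1 hello\n',
                       'end rev-1\n'])
        df.close()
        df = gzip.GzipFile(mode='rb', fileobj=StringIO(sio.getvalue()))
        rec = df.readline().split()
        assert rec[0] == 'version' and len(rec) == 4
        return rec[1], int(rec[2]), df.readlines()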

    def _parse_record_header(self, version_id, raw_data):
        """Parse a record header for consistency.

        :return: the header and the decompressor stream.
                 as (stream, header_record)
        """
        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        rec = df.readline().split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')
        if rec[1].decode('utf-8') != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r' % (
                                version_id, rec[1]))
        return df, rec

    def _parse_record(self, version_id, data):
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        df, rec = self._parse_record_header(version_id, data)
        record_contents = df.readlines()
        l = record_contents.pop()
        assert len(record_contents) == int(rec[2])
        if l.decode('utf-8') != 'end %s\n' % version_id:
            raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
                              % (l, version_id))
        return record_contents, rec[3]

    def read_records_iter_raw(self, records):
        """Read text records from data file and yield raw data.

        This unpacks enough of the text record to validate the id is
        as expected but that's all.

        It will actively recompress currently cached records on the
        basis that that is cheaper than I/O activity.
        """
        needed_records = []
        for version_id, pos, size in records:
            if version_id not in self._records:
                needed_records.append((version_id, pos, size))

        # setup an iterator of the external records:
        # uses readv so nice and fast we hope.
        if len(needed_records):
            # grab the disk data needed.
            raw_records = self._transport.readv(self._filename,
                [(pos, size) for version_id, pos, size in needed_records])

        for version_id, pos, size in records:
            if version_id in self._records:
                # compress a new version
                size, sio = self._record_to_data(version_id,
                                                 self._records[version_id][0],
                                                 self._records[version_id][1])
                yield version_id, sio.getvalue()
            else:
                pos, data = raw_records.next()
                # validate the header
                df, rec = self._parse_record_header(version_id, data)
                df.close()
                yield version_id, data

    def read_records_iter(self, records):
        """Read text records from data file and yield result.

        Each passed record is a tuple of (version_id, pos, len) and
        will be read in the given order.  Yields (version_id,
        contents, digest).
        """
        # 60890  calls for 4168 extractions in 5045, 683 internal.
        # 4168   calls to readv              in 1411
        # 4168   calls to parse_record       in 2880

        needed_records = []
        for version_id, pos, size in records:
            if version_id not in self._records:
                needed_records.append((version_id, pos, size))

        if len(needed_records):
            needed_records.sort(key=operator.itemgetter(1))
            # We take it that the transport optimizes the fetching as well
            # as possible (ie, reads continuous ranges.)
            response = self._transport.readv(self._filename,
                [(pos, size) for version_id, pos, size in needed_records])

            for (record_id, pos, size), (pos, data) in \
                izip(iter(needed_records), response):
                content, digest = self._parse_record(record_id, data)
                self._records[record_id] = (digest, content)

        for version_id, pos, size in records:
            yield version_id, list(self._records[version_id][1]), self._records[version_id][0]

    def read_records(self, records):
        """Read records into a dictionary."""
        components = {}
        for record_id, content, digest in self.read_records_iter(records):
            components[record_id] = (content, digest)
        return components


class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_from_factory = KnitVersionedFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with knits."""
        try:
            return (isinstance(source, KnitVersionedFile) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                if n1 != n2:
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            # safe
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            # plan the join:
            copy_queue = []
            copy_queue_records = []
            copy_set = set()
            for version_id in version_list:
                options = self.source._index.get_options(version_id)
                parents = self.source._index.get_parents_with_ghosts(version_id)
                # check that this will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must:
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (self.target.has_version(parent) or
                            parent in copy_set or
                            not self.source.has_version(parent))
                data_pos, data_size = self.source._index.get_position(version_id)
                copy_queue_records.append((version_id, data_pos, data_size))
                copy_queue.append((version_id, options, parents))
                copy_set.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            raw_datum = []
            raw_records = []
            for (version_id, raw_data), \
                (version_id2, options, parents) in \
                izip(self.source._data.read_records_iter_raw(copy_queue_records),
                     copy_queue):
                assert version_id == version_id2, 'logic error, inconsistent results'
                count = count + 1
                pb.update("Joining knit", count, total)
                raw_records.append((version_id, options, parents, len(raw_data)))
                raw_datum.append(raw_data)
            self.target._add_raw_records(raw_records, ''.join(raw_datum))

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(InterKnit)


class WeaveToKnit(InterVersionedFile):
    """Optimised code paths for weave to knit operations."""

    _matching_file_from_factory = bzrlib.weave.WeaveFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with weaves to knits."""
        try:
            return (isinstance(source, bzrlib.weave.Weave) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, bzrlib.weave.Weave)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # if all of n2's parents are in n1, then it's fine.
                if n2.difference(n1):
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            # safe
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            # do the join:
            count = 0
            total = len(version_list)
            for version_id in version_list:
                pb.update("Converting to knit", count, total)
                parents = self.source.get_parents(version_id)
                # check that this will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must already have it
                    assert (self.target.has_version(parent))
                self.target.add_lines(
                    version_id, parents, self.source.get_lines(version_id))
                count = count + 1

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()
1682
InterVersionedFile.register_optimiser(WeaveToKnit)
1685
class KnitSequenceMatcher(difflib.SequenceMatcher):
1686
"""Knit tuned sequence matcher.
1688
This is based on profiling of difflib which indicated some improvements
1689
for our usage pattern.
1692
def find_longest_match(self, alo, ahi, blo, bhi):
1693
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
1695
If isjunk is not defined:
1697
Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
1698
alo <= i <= i+k <= ahi
1699
blo <= j <= j+k <= bhi
1700
and for all (i',j',k') meeting those conditions,
1703
and if i == i', j <= j'
1705
In other words, of all maximal matching blocks, return one that
1706
starts earliest in a, and of all those maximal matching blocks that
1707
start earliest in a, return the one that starts earliest in b.
1709
>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
1710
>>> s.find_longest_match(0, 5, 0, 9)
1713
If isjunk is defined, first the longest matching block is
1714
determined as above, but with the additional restriction that no
1715
junk element appears in the block. Then that block is extended as
1716
far as possible by matching (only) junk elements on both sides. So
1717
the resulting block never matches on junk except as identical junk
1718
happens to be adjacent to an "interesting" match.
1720
Here's the same example as before, but considering blanks to be
1721
junk. That prevents " abcd" from matching the " abcd" at the tail
1722
end of the second sequence directly. Instead only the "abcd" can
1723
match, and matches the leftmost "abcd" in the second sequence:
1725
>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
1726
>>> s.find_longest_match(0, 5, 0, 9)
1729
If no blocks match, return (alo, blo, 0).
1731
>>> s = SequenceMatcher(None, "ab", "c")
1732
>>> s.find_longest_match(0, 2, 0, 1)
1736
# CAUTION: stripping common prefix or suffix would be incorrect.
1740
# Longest matching block is "ab", but if common prefix is
1741
# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
1742
# strip, so ends up claiming that ab is changed to acab by
1743
# inserting "ca" in the middle. That's minimal but unintuitive:
1744
# "it's obvious" that someone inserted "ac" at the front.
1745
# Windiff ends up at the same place as diff, but by pairing up
1746
# the unique 'b's and then matching the first two 'a's.
1748
a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
1749
besti, bestj, bestsize = alo, blo, 0
1750
# find longest junk-free match
1751
# during an iteration of the loop, j2len[j] = length of longest
1752
# junk-free match ending with a[i-1] and b[j]
1756
for i in xrange(alo, ahi):
1757
# look at all instances of a[i] in b; note that because
1758
# b2j has no junk keys, the loop is skipped if a[i] is junk
1759
j2lenget = j2len.get
1762
# changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
1763
# following improvement
1764
# 704 0 4650.5320 2620.7410 bzrlib.knit:1336(find_longest_match)
1765
# +326674 0 1655.1210 1655.1210 +<method 'get' of 'dict' objects>
1766
# +76519 0 374.6700 374.6700 +<method 'has_key' of 'dict' objects>
1768
# 704 0 3733.2820 2209.6520 bzrlib.knit:1336(find_longest_match)
1769
# +211400 0 1147.3520 1147.3520 +<method 'get' of 'dict' objects>
1770
# +76519 0 376.2780 376.2780 +<method 'has_key' of 'dict' objects>
1782
k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
1784
besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
1787
# Extend the best by non-junk elements on each end. In particular,
1788
# "popular" non-junk elements aren't in b2j, which greatly speeds
1789
# the inner loop above, but also means "the best" match so far
1790
# doesn't contain any junk *or* popular non-junk elements.
1791
while besti > alo and bestj > blo and \
1792
not isbjunk(b[bestj-1]) and \
1793
a[besti-1] == b[bestj-1]:
1794
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
1795
while besti+bestsize < ahi and bestj+bestsize < bhi and \
1796
not isbjunk(b[bestj+bestsize]) and \
1797
a[besti+bestsize] == b[bestj+bestsize]:
1800
# Now that we have a wholly interesting match (albeit possibly
1801
# empty!), we may as well suck up the matching junk on each
1802
# side of it too. Can't think of a good reason not to, and it
1803
# saves post-processing the (possibly considerable) expense of
1804
# figuring out what to do with it. In the case of an empty
1805
# interesting match, this is clearly the right thing to do,
1806
# because no other kind of match is possible in the regions.
1807
while besti > alo and bestj > blo and \
1808
isbjunk(b[bestj-1]) and \
1809
a[besti-1] == b[bestj-1]:
1810
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
1811
while besti+bestsize < ahi and bestj+bestsize < bhi and \
1812
isbjunk(b[bestj+bestsize]) and \
1813
a[besti+bestsize] == b[bestj+bestsize]:
1814
bestsize = bestsize + 1
1816
return besti, bestj, bestsize