# Copyright (C) 2005, 2006 by Canonical Ltd
# Written by Martin Pool.
# Modified by Johan Rydberg <jrydberg@gnu.org>
# Modified by Robert Collins <robert.collins@canonical.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""Knit versionedfile implementation.
22
A knit is a versioned file implementation that supports efficient append only
26
lifeless: the data file is made up of "delta records". each delta record has a delta header
27
that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of
28
the -expanded data- (ie, the delta applied to the parent). the delta also ends with a
29
end-marker; simply "end VERSION"
31
delta can be line or full contents.a
32
... the 8's there are the index number of the annotation.
33
version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e
37
8 e.set('executable', 'yes')
39
8 if elt.get('executable') == 'yes':
40
8 ie.executable = True
41
end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad
45
09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents
46
09:33 < jrydberg> lifeless: the parents are currently dictionary compressed
47
09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)
48
09:33 < lifeless> right
49
09:33 < jrydberg> lifeless: the position and size is the range in the data file
52
so the index sequence is the dictionary compressed sequence number used
53
in the deltas to provide line annotation
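
as an illustration (a made-up entry, assuming the layout written out by
_KnitIndex.add_version below: version id, options, position, size, parents):

  rev-2 line-delta 125 80 0

i.e. rev-2 is stored as a line-delta starting at byte 125 of the data file,
80 bytes long, and its single parent is the version at index position 0.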
# 10:16 < lifeless> make partial index writes safe
# 10:16 < lifeless> implement 'knit.check()' like weave.check()
# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave

# move sha1 out of the content so that join is faster at verifying parents
# record content length ?
# stdlib modules needed by the code below (GzipFile/SequenceMatcher tuning,
# record copying and the transport-based data file access)
from copy import copy
from cStringIO import StringIO
import difflib
import errno
import gzip
from itertools import izip, chain
import zlib

import bzrlib
import bzrlib.errors as errors
from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
        InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
        RevisionNotPresent, RevisionAlreadyPresent
from bzrlib.trace import mutter
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
        sha_strings
import bzrlib.ui
from bzrlib.versionedfile import VersionedFile, InterVersionedFile
from bzrlib.tsort import topo_sort
# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data
# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor
# position after writing to work out where it was located. we may need to
# bypass python file buffering.


INDEX_SUFFIX = '.kndx'
DATA_SUFFIX = '.knit'
103
class KnitContent(object):
104
"""Content of a knit version to which deltas can be applied."""
106
def __init__(self, lines):
109
def annotate_iter(self):
110
"""Yield tuples of (origin, text) for each content line."""
111
for origin, text in self._lines:
115
"""Return a list of (origin, text) tuples."""
116
return list(self.annotate_iter())
118
def line_delta_iter(self, new_lines):
119
"""Generate line-based delta from this content to new_lines."""
120
new_texts = [text for origin, text in new_lines._lines]
121
old_texts = [text for origin, text in self._lines]
122
s = SequenceMatcher(None, old_texts, new_texts)
123
for op in s.get_opcodes():
126
# yields (old_start, old_end, new_line_count, new_lines)
127
yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])
129
def line_delta(self, new_lines):
130
return list(self.line_delta_iter(new_lines))
133
return [text for origin, text in self._lines]
136
class _KnitFactory(object):
137
"""Base factory for creating content objects."""
139
def make(self, lines, version):
140
num_lines = len(lines)
141
return KnitContent(zip([version] * num_lines, lines))
144
class KnitAnnotateFactory(_KnitFactory):
145
"""Factory for creating annotated Content objects."""
149
def parse_fulltext(self, content, version):
150
"""Convert fulltext to internal representation
152
fulltext content is of the format
153
revid(utf8) plaintext\n
154
internal representation is of the format:
159
origin, text = line.split(' ', 1)
160
lines.append((origin.decode('utf-8'), text))
161
return KnitContent(lines)
163
def parse_line_delta_iter(self, lines):
164
"""Convert a line based delta into internal representation.
166
line delta is in the form of:
167
intstart intend intcount
169
revid(utf8) newline\n
170
internal representation is
171
(start, end, count, [1..count tuples (revid, newline)])
174
header = lines.pop(0)
175
start, end, c = [int(n) for n in header.split(',')]
178
origin, text = lines.pop(0).split(' ', 1)
179
contents.append((origin.decode('utf-8'), text))
180
yield start, end, c, contents
182
def parse_line_delta(self, lines, version):
183
return list(self.parse_line_delta_iter(lines))
185
def lower_fulltext(self, content):
186
"""convert a fulltext content record into a serializable form.
188
see parse_fulltext which this inverts.
190
return ['%s %s' % (o.encode('utf-8'), t) for o, t in content._lines]
192
def lower_line_delta(self, delta):
193
"""convert a delta into a serializable form.
195
See parse_line_delta_iter which this inverts.
198
for start, end, c, lines in delta:
199
out.append('%d,%d,%d\n' % (start, end, c))
200
for origin, text in lines:
201
out.append('%s %s' % (origin.encode('utf-8'), text))
205
class KnitPlainFactory(_KnitFactory):
206
"""Factory for creating plain Content objects."""
210
def parse_fulltext(self, content, version):
211
"""This parses an unannotated fulltext.
213
Note that this is not a noop - the internal representation
214
has (versionid, line) - it's just a constant versionid.
216
return self.make(content, version)
218
def parse_line_delta_iter(self, lines, version):
220
header = lines.pop(0)
221
start, end, c = [int(n) for n in header.split(',')]
222
yield start, end, c, zip([version] * c, lines[:c])
225
def parse_line_delta(self, lines, version):
226
return list(self.parse_line_delta_iter(lines, version))
228
def lower_fulltext(self, content):
229
return content.text()
231
def lower_line_delta(self, delta):
233
for start, end, c, lines in delta:
234
out.append('%d,%d,%d\n' % (start, end, c))
235
out.extend([text for origin, text in lines])
239
def make_empty_knit(transport, relpath):
240
"""Construct a empty knit at the specified location."""
241
k = KnitVersionedFile(transport, relpath, 'w', KnitPlainFactory)
245
class KnitVersionedFile(VersionedFile):
246
"""Weave-like structure with faster random access.
248
A knit stores a number of texts and a summary of the relationships
249
between them. Texts are identified by a string version-id. Texts
250
are normally stored and retrieved as a series of lines, but can
251
also be passed as single strings.
253
Lines are stored with the trailing newline (if any) included, to
254
avoid special cases for files with no final newline. Lines are
255
composed of 8-bit characters, not unicode. The combination of
256
these approaches should mean any 'binary' file can be safely
257
stored and retrieved.
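
A brief illustrative sketch of typical use (not taken from the real test
suite; it assumes a bzrlib transport object named 'transport' and uses the
public VersionedFile API methods add_lines() and get_lines() defined below):

    knit = KnitVersionedFile('inventory', transport, create=True)
    knit.add_lines('rev-1', [], ['first line\n'])
    knit.add_lines('rev-2', ['rev-1'], ['first line\n', 'second line\n'])
    assert knit.get_lines('rev-2')[1] == 'second line\n'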
260
def __init__(self, relpath, transport, file_mode=None, access_mode=None, factory=None,
261
basis_knit=None, delta=True, create=False):
262
"""Construct a knit at location specified by relpath.
264
:param create: If not True, only open an existing knit.
266
if access_mode is None:
268
super(KnitVersionedFile, self).__init__(access_mode)
269
assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
270
assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
273
self.transport = transport
274
self.filename = relpath
275
self.basis_knit = basis_knit
276
self.factory = factory or KnitAnnotateFactory()
277
self.writable = (access_mode == 'w')
280
self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
281
access_mode, create=create)
282
self._data = _KnitData(transport, relpath + DATA_SUFFIX,
283
access_mode, create=not len(self.versions()))
285
def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
286
"""See VersionedFile._add_delta()."""
287
self._check_add(version_id, []) # should we check the lines ?
288
self._check_versions_present(parents)
292
for parent in parents:
293
if not self.has_version(parent):
294
ghosts.append(parent)
296
present_parents.append(parent)
298
if delta_parent is None:
299
# reconstitute as full text.
300
assert len(delta) == 1 or len(delta) == 0
302
assert delta[0][0] == 0
303
assert delta[0][1] == 0, delta[0][1]
304
return super(KnitVersionedFile, self)._add_delta(version_id,
315
options.append('no-eol')
317
if delta_parent is not None:
318
# determine the current delta chain length.
319
# To speed the extract of texts the delta chain is limited
320
# to a fixed number of deltas. This should minimize both
321
# I/O and the time spent applying deltas.
323
delta_parents = [delta_parent]
325
parent = delta_parents[0]
326
method = self._index.get_method(parent)
327
if method == 'fulltext':
329
delta_parents = self._index.get_parents(parent)
331
if method == 'line-delta':
332
# did not find a fulltext in the delta limit.
333
# just do a normal insertion.
334
return super(KnitVersionedFile, self)._add_delta(version_id,
341
options.append('line-delta')
342
store_lines = self.factory.lower_line_delta(delta)
344
where, size = self._data.add_record(version_id, digest, store_lines)
345
self._index.add_version(version_id, options, where, size, parents)
347
def clear_cache(self):
348
"""Clear the data cache only."""
349
self._data.clear_cache()
351
def copy_to(self, name, transport):
352
"""See VersionedFile.copy_to()."""
353
# copy the current index to a temp index to avoid racing with local
355
transport.put(name + INDEX_SUFFIX + '.tmp', self.transport.get(self._index._filename))
357
transport.put(name + DATA_SUFFIX, self._data._open_file())
358
# rename the copied index into place
359
transport.rename(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)
361
def create_empty(self, name, transport, mode=None):
362
return KnitVersionedFile(name, transport, factory=self.factory, delta=self.delta, create=True)
364
def _fix_parents(self, version, new_parents):
365
"""Fix the parents list for version.
367
This is done by appending a new version to the index
368
with identical data except for the parents list.
369
The parents list must be a superset of the current
372
current_values = self._index._cache[version]
373
assert set(current_values[4]).difference(set(new_parents)) == set()
374
self._index.add_version(version,
380
def get_delta(self, version_id):
381
"""Get a delta for constructing version from some other version."""
382
if not self.has_version(version_id):
383
raise RevisionNotPresent(version_id, self.filename)
385
parents = self.get_parents(version_id)
390
data_pos, data_size = self._index.get_position(version_id)
391
data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
392
version_idx = self._index.lookup(version_id)
393
noeol = 'no-eol' in self._index.get_options(version_id)
394
if 'fulltext' == self._index.get_method(version_id):
395
new_content = self.factory.parse_fulltext(data, version_idx)
396
if parent is not None:
397
reference_content = self._get_content(parent)
398
old_texts = reference_content.text()
401
new_texts = new_content.text()
402
delta_seq = SequenceMatcher(None, old_texts, new_texts)
403
return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
405
delta = self.factory.parse_line_delta(data, version_idx)
406
return parent, sha1, noeol, delta
408
def get_graph_with_ghosts(self):
409
"""See VersionedFile.get_graph_with_ghosts()."""
410
graph_items = self._index.get_graph()
411
return dict(graph_items)
415
"""See VersionedFile.get_suffixes()."""
416
return [DATA_SUFFIX, INDEX_SUFFIX]
418
def has_ghost(self, version_id):
419
"""True if there is a ghost reference in the file to version_id."""
421
if self.has_version(version_id):
423
# optimisable if needed by memoising the _ghosts set.
424
items = self._index.get_graph()
425
for node, parents in items:
426
for parent in parents:
427
if parent not in self._index._cache:
428
if parent == version_id:
433
"""See VersionedFile.versions."""
434
return self._index.get_versions()
436
def has_version(self, version_id):
437
"""See VersionedFile.has_version."""
438
return self._index.has_version(version_id)
440
__contains__ = has_version
442
def _merge_annotations(self, content, parents, parent_texts={},
443
delta=None, annotated=None):
444
"""Merge annotations for content. This is done by comparing
445
the annotations based on changes to the text.
449
for parent_id in parents:
450
merge_content = self._get_content(parent_id, parent_texts)
451
seq = SequenceMatcher(None, merge_content.text(), content.text())
452
if delta_seq is None:
453
# setup a delta seq to reuse.
455
for i, j, n in seq.get_matching_blocks():
458
# this appears to copy (origin, text) pairs across to the new
459
# content for any line that matches the last-checked parent.
460
# FIXME: save the sequence control data for delta compression
461
# against the most relevant parent rather than rediffing.
462
content._lines[j:j+n] = merge_content._lines[i:i+n]
465
reference_content = self._get_content(parents[0], parent_texts)
466
new_texts = content.text()
467
old_texts = reference_content.text()
468
delta_seq = SequenceMatcher(None, old_texts, new_texts)
469
return self._make_line_delta(delta_seq, content)
471
def _make_line_delta(self, delta_seq, new_content):
472
"""Generate a line delta from delta_seq and new_content."""
474
for op in delta_seq.get_opcodes():
477
diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
480
def _get_components(self, version_id):
481
"""Return a list of (version_id, method, data) tuples that
482
make up the version specified by version_id of the knit.
484
The components should be applied in the order of the returned
487
The basis knit will be used to the largest extent possible
488
since it is assumed that accesses to it are faster.
490
# needed_revisions holds a list of (method, version_id) of
491
# versions that need to be fetched to construct the final
492
# version of the file.
494
# basis_revisions is a list of versions that need to be
# fetched but exist in the basis knit.
497
basis = self.basis_knit
504
if basis and basis._index.has_version(cursor):
506
basis_versions.append(cursor)
507
method = picked_knit._index.get_method(cursor)
508
needed_versions.append((method, cursor))
509
if method == 'fulltext':
511
cursor = picked_knit.get_parents(cursor)[0]
516
for comp_id in basis_versions:
517
data_pos, data_size = basis._index.get_data_position(comp_id)
518
records.append((comp_id, data_pos, data_size))
519
components.update(basis._data.read_records(records))
522
for comp_id in [vid for method, vid in needed_versions
523
if vid not in basis_versions]:
524
data_pos, data_size = self._index.get_position(comp_id)
525
records.append((comp_id, data_pos, data_size))
526
components.update(self._data.read_records(records))
528
# get_data_records returns a mapping with the version id as
529
# index and the value as data. The order the components need
530
# to be applied is held by needed_versions (reversed).
532
for method, comp_id in reversed(needed_versions):
533
out.append((comp_id, method, components[comp_id]))
537
def _get_content(self, version_id, parent_texts={}):
538
"""Returns a content object that makes up the specified
540
if not self.has_version(version_id):
541
raise RevisionNotPresent(version_id, self.filename)
543
cached_version = parent_texts.get(version_id, None)
544
if cached_version is not None:
545
return cached_version
547
if self.basis_knit and version_id in self.basis_knit:
548
return self.basis_knit._get_content(version_id)
551
components = self._get_components(version_id)
552
for component_id, method, (data, digest) in components:
553
version_idx = self._index.lookup(component_id)
554
if method == 'fulltext':
555
assert content is None
556
content = self.factory.parse_fulltext(data, version_idx)
557
elif method == 'line-delta':
558
delta = self.factory.parse_line_delta(data, version_idx)
559
content._lines = self._apply_delta(content._lines, delta)
561
if 'no-eol' in self._index.get_options(version_id):
562
line = content._lines[-1][1].rstrip('\n')
563
content._lines[-1] = (content._lines[-1][0], line)
565
if sha_strings(content.text()) != digest:
567
raise KnitCorrupt(self.filename, 'sha-1 does not match %s' % version_id)
571
def _check_versions_present(self, version_ids):
572
"""Check that all specified versions are present."""
573
version_ids = set(version_ids)
574
for r in list(version_ids):
575
if self._index.has_version(r):
576
version_ids.remove(r)
578
raise RevisionNotPresent(list(version_ids)[0], self.filename)
580
def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
581
"""See VersionedFile.add_lines_with_ghosts()."""
582
self._check_add(version_id, lines)
583
return self._add(version_id, lines[:], parents, self.delta, parent_texts)
585
def _add_lines(self, version_id, parents, lines, parent_texts):
586
"""See VersionedFile.add_lines."""
587
self._check_add(version_id, lines)
588
self._check_versions_present(parents)
589
return self._add(version_id, lines[:], parents, self.delta, parent_texts)
591
def _check_add(self, version_id, lines):
592
"""check that version_id and lines are safe to add."""
593
assert self.writable, "knit is not opened for write"
594
### FIXME escape. RBC 20060228
595
if contains_whitespace(version_id):
596
raise InvalidRevisionId(version_id)
597
if self.has_version(version_id):
598
raise RevisionAlreadyPresent(version_id, self.filename)
600
if False or __debug__:
602
assert '\n' not in l[:-1]
604
def _add(self, version_id, lines, parents, delta, parent_texts):
605
"""Add a set of lines on top of version specified by parents.
607
If delta is true, compress the text as a line-delta against
610
Any versions not present will be converted into ghosts.
612
# 461 0 6546.0390 43.9100 bzrlib.knit:489(_add)
613
# +400 0 889.4890 418.9790 +bzrlib.knit:192(lower_fulltext)
614
# +461 0 1364.8070 108.8030 +bzrlib.knit:996(add_record)
615
# +461 0 193.3940 41.5720 +bzrlib.knit:898(add_version)
616
# +461 0 134.0590 18.3810 +bzrlib.osutils:361(sha_strings)
617
# +461 0 36.3420 15.4540 +bzrlib.knit:146(make)
618
# +1383 0 8.0370 8.0370 +<len>
619
# +61 0 13.5770 7.9190 +bzrlib.knit:199(lower_line_delta)
620
# +61 0 963.3470 7.8740 +bzrlib.knit:427(_get_content)
621
# +61 0 973.9950 5.2950 +bzrlib.knit:136(line_delta)
622
# +61 0 1918.1800 5.2640 +bzrlib.knit:359(_merge_annotations)
626
if parent_texts is None:
628
for parent in parents:
629
if not self.has_version(parent):
630
ghosts.append(parent)
632
present_parents.append(parent)
634
if delta and not len(present_parents):
637
digest = sha_strings(lines)
640
if lines[-1][-1] != '\n':
641
options.append('no-eol')
642
lines[-1] = lines[-1] + '\n'
644
if len(present_parents) and delta:
645
# To speed the extract of texts the delta chain is limited
646
# to a fixed number of deltas. This should minimize both
647
# I/O and the time spent applying deltas.
649
delta_parents = present_parents
651
parent = delta_parents[0]
652
method = self._index.get_method(parent)
653
if method == 'fulltext':
655
delta_parents = self._index.get_parents(parent)
657
if method == 'line-delta':
660
lines = self.factory.make(lines, version_id)
661
if delta or (self.factory.annotated and len(present_parents) > 0):
662
# Merge annotations from parent texts if needed.
663
delta_hunks = self._merge_annotations(lines, present_parents, parent_texts,
664
delta, self.factory.annotated)
667
options.append('line-delta')
668
store_lines = self.factory.lower_line_delta(delta_hunks)
670
options.append('fulltext')
671
store_lines = self.factory.lower_fulltext(lines)
673
where, size = self._data.add_record(version_id, digest, store_lines)
674
self._index.add_version(version_id, options, where, size, parents)
677
def check(self, progress_bar=None):
678
"""See VersionedFile.check()."""
680
def _clone_text(self, new_version_id, old_version_id, parents):
681
"""See VersionedFile.clone_text()."""
682
# FIXME RBC 20060228 make fast by only inserting an index with null delta.
683
self.add_lines(new_version_id, parents, self.get_lines(old_version_id))
685
def get_lines(self, version_id):
686
"""See VersionedFile.get_lines()."""
687
return self._get_content(version_id).text()
689
def iter_lines_added_or_present_in_versions(self, version_ids=None):
690
"""See VersionedFile.iter_lines_added_or_present_in_versions()."""
691
if version_ids is None:
692
version_ids = self.versions()
693
# we don't care about inclusions, the caller cares.
# but we need to set up a list of records to visit.
695
# we need version_id, position, length
696
version_id_records = []
697
requested_versions = list(version_ids)
698
# filter for available versions
699
for version_id in requested_versions:
700
if not self.has_version(version_id):
701
raise RevisionNotPresent(version_id, self.filename)
702
# get an in-component-order queue:
704
for version_id in self.versions():
705
if version_id in requested_versions:
706
version_ids.append(version_id)
707
data_pos, length = self._index.get_position(version_id)
708
version_id_records.append((version_id, data_pos, length))
710
pb = bzrlib.ui.ui_factory.nested_progress_bar()
712
total = len(version_id_records)
714
pb.update('Walking content.', count, total)
715
for version_id, data, sha_value in \
716
self._data.read_records_iter(version_id_records):
717
pb.update('Walking content.', count, total)
718
method = self._index.get_method(version_id)
719
version_idx = self._index.lookup(version_id)
720
assert method in ('fulltext', 'line-delta')
721
if method == 'fulltext':
722
content = self.factory.parse_fulltext(data, version_idx)
723
for line in content.text():
726
delta = self.factory.parse_line_delta(data, version_idx)
727
for start, end, count, lines in delta:
728
for origin, line in lines:
731
pb.update('Walking content.', total, total)
734
pb.update('Walking content.', total, total)
738
def num_versions(self):
739
"""See VersionedFile.num_versions()."""
740
return self._index.num_versions()
742
__len__ = num_versions
744
def annotate_iter(self, version_id):
745
"""See VersionedFile.annotate_iter."""
746
content = self._get_content(version_id)
747
for origin, text in content.annotate_iter():
750
def get_parents(self, version_id):
751
"""See VersionedFile.get_parents."""
752
self._check_versions_present([version_id])
753
return list(self._index.get_parents(version_id))
755
def get_parents_with_ghosts(self, version_id):
756
"""See VersionedFile.get_parents."""
757
self._check_versions_present([version_id])
758
return list(self._index.get_parents_with_ghosts(version_id))
760
def get_ancestry(self, versions):
761
"""See VersionedFile.get_ancestry."""
762
if isinstance(versions, basestring):
763
versions = [versions]
766
self._check_versions_present(versions)
767
return self._index.get_ancestry(versions)
769
def get_ancestry_with_ghosts(self, versions):
770
"""See VersionedFile.get_ancestry_with_ghosts."""
771
if isinstance(versions, basestring):
772
versions = [versions]
775
self._check_versions_present(versions)
776
return self._index.get_ancestry_with_ghosts(versions)
778
#@deprecated_method(zero_eight)
779
def walk(self, version_ids):
780
"""See VersionedFile.walk."""
781
# We take the short path here, and extract all relevant texts
782
# and put them in a weave and let that do all the work. Far
783
# from optimal, but is much simpler.
784
# FIXME RB 20060228 this really is inefficient!
785
from bzrlib.weave import Weave
787
w = Weave(self.filename)
788
ancestry = self.get_ancestry(version_ids)
789
sorted_graph = topo_sort(self._index.get_graph())
790
version_list = [vid for vid in sorted_graph if vid in ancestry]
792
for version_id in version_list:
793
lines = self.get_lines(version_id)
794
w.add_lines(version_id, self.get_parents(version_id), lines)
796
for lineno, insert_id, dset, line in w.walk(version_ids):
797
yield lineno, insert_id, dset, line
800
class _KnitComponentFile(object):
801
"""One of the files used to implement a knit database"""
803
def __init__(self, transport, filename, mode):
804
self._transport = transport
805
self._filename = filename
808
def write_header(self):
809
if self._transport.append(self._filename, StringIO(self.HEADER)):
810
raise KnitCorrupt(self._filename, 'misaligned after writing header')
812
def check_header(self, fp):
813
line = fp.read(len(self.HEADER))
814
if line != self.HEADER:
815
raise KnitHeaderError(badline=line)
818
"""Commit is a nop."""
821
return '%s(%s)' % (self.__class__.__name__, self._filename)
824
class _KnitIndex(_KnitComponentFile):
825
"""Manages knit index file.
827
The index is already kept in memory and read on startup, to enable
828
fast lookups of revision information. The cursor of the index
829
file is always pointing to the end, making it easy to append
832
_cache is a cache for fast mapping from version id to a Index
835
_history is a cache for fast mapping from indexes to version ids.
837
The index data format is dictionary compressed when it comes to
parent references; an index entry may only have parents with a
lower index number. As a result, the index is topologically sorted.

Duplicate entries may be written to the index for a single version id;
if this is done then the latter one completely replaces the former:
this allows updates to correct version and parent information.
Note that the two entries may share the delta, and that successive
annotations and references MUST point to the first entry.
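
For illustration (made-up entries, not from a real index), two consecutive
index records might read:

  rev-1 fulltext 0 120
  rev-2 line-delta 120 45 0 .ghost-rev

rev-2's parents are dictionary compressed: '0' refers to the entry at index
position 0 (rev-1), while '.ghost-rev' records a ghost parent by name
(see _parse_parents and _version_list_to_index below).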
848
HEADER = "# bzr knit index 7\n"
850
# speed of knit parsing went from 280 ms to 280 ms with slots addition.
851
# __slots__ = ['_cache', '_history', '_transport', '_filename']
853
def _cache_version(self, version_id, options, pos, size, parents):
854
"""Cache a version record in the history array and index cache.
856
This is inlined into __init__ for performance. KEEP IN SYNC.
857
(It saves 60ms, 25% of the __init__ overhead on local 4000 record
860
# only want the _history index to reference the 1st index entry
862
if version_id not in self._cache:
863
self._history.append(version_id)
864
self._cache[version_id] = (version_id, options, pos, size, parents)
866
def __init__(self, transport, filename, mode, create=False):
867
_KnitComponentFile.__init__(self, transport, filename, mode)
869
# position in _history is the 'official' index for a revision
870
# but the values may have come from a newer entry.
871
# so - wc -l of a knit index is != the number of unique names
874
pb = bzrlib.ui.ui_factory.nested_progress_bar()
879
pb.update('read knit index', count, total)
880
fp = self._transport.get(self._filename)
881
self.check_header(fp)
882
# readlines reads the whole file at once:
883
# bad for transports like http, good for local disk
884
# we save 60 ms doing this one change (
885
# from calling readline each time to calling
887
# probably what we want for nice behaviour on
888
# http is an incremental readlines that yields, or
889
# a check for local vs non local indexes,
890
for l in fp.readlines():
894
#pb.update('read knit index', count, total)
895
# See self._parse_parents
897
for value in rec[4:]:
899
# uncompressed reference
900
parents.append(value[1:])
902
# this is 15/4000ms faster than isinstance,
904
# this function is called thousands of times a
905
# second so small variations add up.
906
assert value.__class__ is str
907
parents.append(self._history[int(value)])
908
# end self._parse_parents
909
# self._cache_version(rec[0],
914
# --- self._cache_version
915
# only want the _history index to reference the 1st
916
# index entry for version_id
918
if version_id not in self._cache:
919
self._history.append(version_id)
920
self._cache[version_id] = (version_id,
925
# --- self._cache_version
926
except NoSuchFile, e:
927
if mode != 'w' or not create:
931
pb.update('read knit index', total, total)
934
def _parse_parents(self, compressed_parents):
935
"""convert a list of string parent values into version ids.
937
ints are looked up in the index.
938
.FOO values are ghosts and converted into FOO.
940
NOTE: the function is retained here for clarity, and for possible
941
use in partial index reads. However bulk processing now has
942
it inlined in __init__ for inner-loop optimisation.
945
for value in compressed_parents:
947
# uncompressed reference
948
result.append(value[1:])
950
# this is 15/4000ms faster than isinstance,
951
# this function is called thousands of times a
952
# second so small variations add up.
953
assert value.__class__ is str
954
result.append(self._history[int(value)])
959
for version_id, index in self._cache.iteritems():
960
graph.append((version_id, index[4]))
963
def get_ancestry(self, versions):
964
"""See VersionedFile.get_ancestry."""
965
# get a graph of all the mentioned versions:
967
pending = set(versions)
969
version = pending.pop()
970
parents = self._cache[version][4]
973
parents = [parent for parent in parents if parent in self._cache]
974
for parent in parents:
975
# if not completed and not a ghost
976
if parent not in graph:
978
graph[version] = parents
979
return topo_sort(graph.items())
981
def get_ancestry_with_ghosts(self, versions):
982
"""See VersionedFile.get_ancestry_with_ghosts."""
983
# get a graph of all the mentioned versions:
985
pending = set(versions)
987
version = pending.pop()
989
parents = self._cache[version][4]
996
for parent in parents:
997
if parent not in graph:
999
graph[version] = parents
1000
return topo_sort(graph.items())
1002
def num_versions(self):
1003
return len(self._history)
1005
__len__ = num_versions
1007
def get_versions(self):
1008
return self._history
1010
def idx_to_name(self, idx):
1011
return self._history[idx]
1013
def lookup(self, version_id):
1014
assert version_id in self._cache
1015
return self._history.index(version_id)
1017
def _version_list_to_index(self, versions):
1019
for version in versions:
1020
if version in self._cache:
1021
result_list.append(str(self._history.index(version)))
1023
result_list.append('.' + version.encode('utf-8'))
1024
return ' '.join(result_list)
1026
def add_version(self, version_id, options, pos, size, parents):
1027
"""Add a version record to the index."""
1028
self._cache_version(version_id, options, pos, size, parents)
1030
content = "%s %s %s %s %s\n" % (version_id.encode('utf-8'),
1034
self._version_list_to_index(parents))
1035
assert isinstance(content, str), 'content must be utf-8 encoded'
1036
self._transport.append(self._filename, StringIO(content))
1038
def has_version(self, version_id):
1039
"""True if the version is in the index."""
1040
return version_id in self._cache
1042
def get_position(self, version_id):
1043
"""Return data position and size of specified version."""
1044
return (self._cache[version_id][2], \
1045
self._cache[version_id][3])
1047
def get_method(self, version_id):
1048
"""Return compression method of specified version."""
1049
options = self._cache[version_id][1]
1050
if 'fulltext' in options:
1053
assert 'line-delta' in options
1056
def get_options(self, version_id):
1057
return self._cache[version_id][1]
1059
def get_parents(self, version_id):
1060
"""Return parents of specified version ignoring ghosts."""
1061
return [parent for parent in self._cache[version_id][4]
1062
if parent in self._cache]
1064
def get_parents_with_ghosts(self, version_id):
1065
"""Return parents of specified version wth ghosts."""
1066
return self._cache[version_id][4]
1068
def check_versions_present(self, version_ids):
1069
"""Check that all specified versions are present."""
1070
version_ids = set(version_ids)
1071
for version_id in list(version_ids):
1072
if version_id in self._cache:
1073
version_ids.remove(version_id)
1075
raise RevisionNotPresent(list(version_ids)[0], self.filename)
1078
class _KnitData(_KnitComponentFile):
1079
"""Contents of the knit data file"""
1081
HEADER = "# bzr knit data 7\n"
1083
def __init__(self, transport, filename, mode, create=False):
1084
_KnitComponentFile.__init__(self, transport, filename, mode)
1086
self._checked = False
1088
self._transport.put(self._filename, StringIO(''))
1091
def clear_cache(self):
1092
"""Clear the record cache."""
1095
def _open_file(self):
1096
if self._file is None:
1098
self._file = self._transport.get(self._filename)
1103
def _record_to_data(self, version_id, digest, lines):
1104
"""Convert version_id, digest, lines into a raw data block.
1106
:return: (len, a StringIO instance with the raw data ready to read.)
1109
data_file = GzipFile(None, mode='wb', fileobj=sio)
1110
data_file.writelines(chain(
1111
["version %s %d %s\n" % (version_id.encode('utf-8'),
1115
["end %s\n\n" % version_id.encode('utf-8')]))
1122
def add_raw_record(self, raw_data):
1123
"""Append a prepared record to the data file."""
1124
assert isinstance(raw_data, str), 'data must be plain bytes'
1125
start_pos = self._transport.append(self._filename, StringIO(raw_data))
1126
return start_pos, len(raw_data)
1128
def add_record(self, version_id, digest, lines):
1129
"""Write new text record to disk. Returns the position in the
1130
file where it was written."""
1131
size, sio = self._record_to_data(version_id, digest, lines)
1133
self._records[version_id] = (digest, lines)
1135
start_pos = self._transport.append(self._filename, sio)
1136
return start_pos, size
1138
def _parse_record_header(self, version_id, raw_data):
1139
"""Parse a record header for consistency.
1141
:return: the header and the decompressor stream.
1142
as (stream, header_record)
1144
df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
1145
rec = df.readline().split()
1147
raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')
1148
if rec[1].decode('utf-8') != version_id:
1149
raise KnitCorrupt(self._filename,
1150
'unexpected version, wanted %r, got %r' % (
1151
version_id, rec[1]))
1154
def _parse_record(self, version_id, data):
1155
df, rec = self._parse_record_header(version_id, data)
1157
record_contents = self._read_record_contents(df, lines)
1159
if l.decode('utf-8') != 'end %s\n' % version_id:
1160
raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
1163
return record_contents, rec[3]
1165
def _read_record_contents(self, df, record_lines):
1166
"""Read and return n lines from datafile."""
1168
for i in range(record_lines):
1169
r.append(df.readline())
1172
def read_records_iter_raw(self, records):
1173
"""Read text records from data file and yield raw data.
1175
This unpacks enough of the text record to validate the id is
1176
as expected, but that's all.
1178
It will actively recompress currently cached records on the
1179
basis that that is cheaper than I/O activity.
1182
for version_id, pos, size in records:
1183
if version_id not in self._records:
1184
needed_records.append((version_id, pos, size))
1186
# setup an iterator of the external records:
1187
# uses readv so nice and fast we hope.
1188
if len(needed_records):
1189
# grab the disk data needed.
1190
raw_records = self._transport.readv(self._filename,
1191
[(pos, size) for version_id, pos, size in needed_records])
1193
for version_id, pos, size in records:
1194
if version_id in self._records:
1195
# compress a new version
1196
size, sio = self._record_to_data(version_id,
1197
self._records[version_id][0],
1198
self._records[version_id][1])
1199
yield version_id, sio.getvalue()
1201
pos, data = raw_records.next()
1202
# validate the header
1203
df, rec = self._parse_record_header(version_id, data)
1205
yield version_id, data
1208
def read_records_iter(self, records):
1209
"""Read text records from data file and yield result.
1211
Each passed record is a tuple of (version_id, pos, len) and
1212
will be read in the given order. Yields (version_id,
1217
for version_id, pos, size in records:
1218
if version_id not in self._records:
1219
needed_records.append((version_id, pos, size))
1221
if len(needed_records):
1222
# We take it that the transport optimizes the fetching as well
# as possible (ie, reads continuous ranges.)
1224
response = self._transport.readv(self._filename,
1225
[(pos, size) for version_id, pos, size in needed_records])
1227
for (record_id, pos, size), (pos, data) in izip(iter(needed_records), response):
1228
content, digest = self._parse_record(record_id, data)
1229
self._records[record_id] = (digest, content)
1231
for version_id, pos, size in records:
1232
yield version_id, copy(self._records[version_id][1]), copy(self._records[version_id][0])
1234
def read_records(self, records):
1235
"""Read records into a dictionary."""
1237
for record_id, content, digest in self.read_records_iter(records):
1238
components[record_id] = (content, digest)
1242
class InterKnit(InterVersionedFile):
1243
"""Optimised code paths for knit to knit operations."""
1245
_matching_file_factory = KnitVersionedFile
1248
def is_compatible(source, target):
1249
"""Be compatible with knits. """
1251
return (isinstance(source, KnitVersionedFile) and
1252
isinstance(target, KnitVersionedFile))
1253
except AttributeError:
1256
def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
1257
"""See InterVersionedFile.join."""
1258
assert isinstance(self.source, KnitVersionedFile)
1259
assert isinstance(self.target, KnitVersionedFile)
1261
if version_ids is None:
1262
version_ids = self.source.versions()
1264
if not ignore_missing:
1265
self.source._check_versions_present(version_ids)
1267
version_ids = set(self.source.versions()).intersection(
1273
pb = bzrlib.ui.ui_factory.nested_progress_bar()
1275
version_ids = list(version_ids)
1276
if None in version_ids:
1277
version_ids.remove(None)
1279
self.source_ancestry = set(self.source.get_ancestry(version_ids))
1280
this_versions = set(self.target._index.get_versions())
1281
needed_versions = self.source_ancestry - this_versions
1282
cross_check_versions = self.source_ancestry.intersection(this_versions)
1283
mismatched_versions = set()
1284
for version in cross_check_versions:
1285
# scan to include needed parents.
1286
n1 = set(self.target.get_parents_with_ghosts(version))
1287
n2 = set(self.source.get_parents_with_ghosts(version))
1289
# FIXME TEST this check for cycles being introduced works
1290
# the logic is we have a cycle if in our graph we are an
1291
# ancestor of any of the n2 revisions.
1297
parent_ancestors = self.source.get_ancestry(parent)
1298
if version in parent_ancestors:
1299
raise errors.GraphCycleError([parent, version])
1300
# ensure this parent will be available later.
1301
new_parents = n2.difference(n1)
1302
needed_versions.update(new_parents.difference(this_versions))
1303
mismatched_versions.add(version)
1305
if not needed_versions and not cross_check_versions:
1307
full_list = topo_sort(self.source.get_graph())
1309
version_list = [i for i in full_list if (not self.target.has_version(i)
1310
and i in needed_versions)]
1314
copy_queue_records = []
1316
for version_id in version_list:
1317
options = self.source._index.get_options(version_id)
1318
parents = self.source._index.get_parents_with_ghosts(version_id)
1319
# check that it will be a consistent copy:
1320
for parent in parents:
1321
# if source has the parent, we must :
1322
# * already have it or
1323
# * have it scheduled already
1324
# otherwise we don't care
1325
assert (self.target.has_version(parent) or
1326
parent in copy_set or
1327
not self.source.has_version(parent))
1328
data_pos, data_size = self.source._index.get_position(version_id)
1329
copy_queue_records.append((version_id, data_pos, data_size))
1330
copy_queue.append((version_id, options, parents))
1331
copy_set.add(version_id)
1333
# data suck the join:
1335
total = len(version_list)
1336
# we want the raw gzip for bulk copying, but the record validated
1337
# just enough to be sure its the right one.
1338
# TODO: consider writev or write combining to reduce
1339
# death of a thousand cuts feeling.
1340
for (version_id, raw_data), \
1341
(version_id2, options, parents) in \
1342
izip(self.source._data.read_records_iter_raw(copy_queue_records),
1344
assert version_id == version_id2, 'logic error, inconsistent results'
1346
pb.update("Joining knit", count, total)
1347
pos, size = self.target._data.add_raw_record(raw_data)
1348
self.target._index.add_version(version_id, options, pos, size, parents)
1350
for version in mismatched_versions:
1351
# FIXME RBC 20060309 is this needed?
1352
n1 = set(self.target.get_parents_with_ghosts(version))
1353
n2 = set(self.source.get_parents_with_ghosts(version))
1354
# write a combined record to our history preserving the current
1355
# parents as first in the list
1356
new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
1357
self.target.fix_parents(version, new_parents)
1363
InterVersionedFile.register_optimiser(InterKnit)
1366
# make GzipFile faster:
1368
class GzipFile(gzip.GzipFile):
1369
"""Knit tuned version of GzipFile.
1371
This is based on the following lsprof stats:
1372
python 2.4 stock GzipFile write:
1373
58971 0 5644.3090 2721.4730 gzip:193(write)
1374
+58971 0 1159.5530 1159.5530 +<built-in method compress>
1375
+176913 0 987.0320 987.0320 +<len>
1376
+58971 0 423.1450 423.1450 +<zlib.crc32>
1377
+58971 0 353.1060 353.1060 +<method 'write' of 'cStringIO.
1379
tuned GzipFile write:
1380
58971 0 4477.2590 2103.1120 bzrlib.knit:1250(write)
1381
+58971 0 1297.7620 1297.7620 +<built-in method compress>
1382
+58971 0 406.2160 406.2160 +<zlib.crc32>
1383
+58971 0 341.9020 341.9020 +<method 'write' of 'cStringIO.
1385
+58971 0 328.2670 328.2670 +<len>
1388
Yes, it's only 1.6 seconds, but they add up.
1391
def write(self, data):
1392
if self.mode != gzip.WRITE:
1394
raise IOError(errno.EBADF, "write() on read-only GzipFile object")
1396
if self.fileobj is None:
1397
raise ValueError, "write() on closed GzipFile object"
1398
data_len = len(data)
1400
self.size = self.size + data_len
1401
self.crc = zlib.crc32(data, self.crc)
1402
self.fileobj.write( self.compress.compress(data) )
1403
self.offset += data_len
1405
def writelines(self, lines):
1406
# profiling indicated a significant overhead
1407
# calling write for each line.
1408
# this batch call is a lot faster :).
1409
# (4 seconds to 1 second for the sample upgrades I was testing).
1410
self.write(''.join(lines))
1413
class SequenceMatcher(difflib.SequenceMatcher):
1414
"""Knit tuned sequence matcher.
1416
This is based on profiling of difflib which indicated some improvements
1417
for our usage pattern.
1420
def find_longest_match(self, alo, ahi, blo, bhi):
1421
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
1423
If isjunk is not defined:
1425
Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
1426
alo <= i <= i+k <= ahi
1427
blo <= j <= j+k <= bhi
1428
and for all (i',j',k') meeting those conditions,
1431
and if i == i', j <= j'
1433
In other words, of all maximal matching blocks, return one that
1434
starts earliest in a, and of all those maximal matching blocks that
1435
start earliest in a, return the one that starts earliest in b.
1437
>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
1438
>>> s.find_longest_match(0, 5, 0, 9)
1441
If isjunk is defined, first the longest matching block is
1442
determined as above, but with the additional restriction that no
1443
junk element appears in the block. Then that block is extended as
1444
far as possible by matching (only) junk elements on both sides. So
1445
the resulting block never matches on junk except as identical junk
1446
happens to be adjacent to an "interesting" match.
1448
Here's the same example as before, but considering blanks to be
1449
junk. That prevents " abcd" from matching the " abcd" at the tail
1450
end of the second sequence directly. Instead only the "abcd" can
1451
match, and matches the leftmost "abcd" in the second sequence:
1453
>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
1454
>>> s.find_longest_match(0, 5, 0, 9)
1457
If no blocks match, return (alo, blo, 0).
1459
>>> s = SequenceMatcher(None, "ab", "c")
1460
>>> s.find_longest_match(0, 2, 0, 1)
1464
# CAUTION: stripping common prefix or suffix would be incorrect.
1468
# Longest matching block is "ab", but if common prefix is
1469
# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
1470
# strip, so ends up claiming that ab is changed to acab by
1471
# inserting "ca" in the middle. That's minimal but unintuitive:
1472
# "it's obvious" that someone inserted "ac" at the front.
1473
# Windiff ends up at the same place as diff, but by pairing up
1474
# the unique 'b's and then matching the first two 'a's.
1476
a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
1477
besti, bestj, bestsize = alo, blo, 0
1478
# find longest junk-free match
1479
# during an iteration of the loop, j2len[j] = length of longest
1480
# junk-free match ending with a[i-1] and b[j]
1484
for i in xrange(alo, ahi):
1485
# look at all instances of a[i] in b; note that because
1486
# b2j has no junk keys, the loop is skipped if a[i] is junk
1487
j2lenget = j2len.get
1490
# changing b2j.get(a[i], nothing) to a try:Keyerror pair produced the
1491
# following improvement
1492
# 704 0 4650.5320 2620.7410 bzrlib.knit:1336(find_longest_match)
1493
# +326674 0 1655.1210 1655.1210 +<method 'get' of 'dict' objects>
1494
# +76519 0 374.6700 374.6700 +<method 'has_key' of 'dict' objects>
1496
# 704 0 3733.2820 2209.6520 bzrlib.knit:1336(find_longest_match)
1497
# +211400 0 1147.3520 1147.3520 +<method 'get' of 'dict' objects>
1498
# +76519 0 376.2780 376.2780 +<method 'has_key' of 'dict' objects>
1510
k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
1512
besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
1515
# Extend the best by non-junk elements on each end. In particular,
1516
# "popular" non-junk elements aren't in b2j, which greatly speeds
1517
# the inner loop above, but also means "the best" match so far
1518
# doesn't contain any junk *or* popular non-junk elements.
1519
while besti > alo and bestj > blo and \
1520
not isbjunk(b[bestj-1]) and \
1521
a[besti-1] == b[bestj-1]:
1522
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
1523
while besti+bestsize < ahi and bestj+bestsize < bhi and \
1524
not isbjunk(b[bestj+bestsize]) and \
1525
a[besti+bestsize] == b[bestj+bestsize]:
1528
# Now that we have a wholly interesting match (albeit possibly
1529
# empty!), we may as well suck up the matching junk on each
1530
# side of it too. Can't think of a good reason not to, and it
1531
# saves post-processing the (possibly considerable) expense of
1532
# figuring out what to do with it. In the case of an empty
1533
# interesting match, this is clearly the right thing to do,
1534
# because no other kind of match is possible in the regions.
1535
while besti > alo and bestj > blo and \
1536
isbjunk(b[bestj-1]) and \
1537
a[besti-1] == b[bestj-1]:
1538
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
1539
while besti+bestsize < ahi and bestj+bestsize < bhi and \
1540
isbjunk(b[bestj+bestsize]) and \
1541
a[besti+bestsize] == b[bestj+bestsize]:
1542
bestsize = bestsize + 1
1544
return besti, bestj, bestsize