1
# Copyright (C) 2005, 2006 by Canonical Ltd
2
# Written by Martin Pool.
3
# Modified by Johan Rydberg <jrydberg@gnu.org>
4
# Modified by Robert Collins <robert.collins@canonical.com>
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
11
# This program is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
"""Knit versionedfile implementation.
22
A knit is a versioned file implementation that supports efficient append only
28
from difflib import SequenceMatcher
30
from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
31
InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
32
RevisionNotPresent, RevisionAlreadyPresent
33
from bzrlib.trace import mutter
34
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
36
from bzrlib.versionedfile import VersionedFile
37
from bzrlib.tsort import topo_sort
39
from StringIO import StringIO
40
from gzip import GzipFile
43
# TODO: Split out code specific to this format into an associated object.
45
# TODO: Can we put in some kind of value to check that the index and data
46
# files belong together?
48
# TODO: accommodate binaries, perhaps by storing a byte count
50
# TODO: function to check whole file
52
# TODO: atomically append data, then measure backwards from the cursor
53
# position after writing to work out where it was located. we may need to
54
# bypass python file buffering.
57
INDEX_SUFFIX = '.kndx'
60
class KnitContent(object):
61
"""Content of a knit version to which deltas can be applied."""
63
def __init__(self, lines):
66
def annotate_iter(self):
67
"""Yield tuples of (origin, text) for each content line."""
68
for origin, text in self._lines:
72
"""Return a list of (origin, text) tuples."""
73
return list(self.annotate_iter())
75
def apply_delta(self, delta):
76
"""Apply delta to this content."""
78
for start, end, count, lines in delta:
79
self._lines[offset+start:offset+end] = lines
80
offset = offset + (start - end) + count
82
def line_delta_iter(self, new_lines):
83
"""Generate line-based delta from new_lines to this content."""
84
new_texts = [text for origin, text in new_lines._lines]
85
old_texts = [text for origin, text in self._lines]
86
s = difflib.SequenceMatcher(None, old_texts, new_texts)
87
for op in s.get_opcodes():
90
yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])
92
def line_delta(self, new_lines):
93
return list(self.line_delta_iter(new_lines))
96
return [text for origin, text in self._lines]
99
class _KnitFactory(object):
100
"""Base factory for creating content objects."""
102
def make(self, lines, version):
103
num_lines = len(lines)
104
return KnitContent(zip([version] * num_lines, lines))
107
class KnitAnnotateFactory(_KnitFactory):
108
"""Factory for creating annotated Content objects."""
112
def parse_fulltext(self, content, version):
115
origin, text = line.split(' ', 1)
116
lines.append((int(origin), text))
117
return KnitContent(lines)
119
def parse_line_delta_iter(self, lines):
121
header = lines.pop(0)
122
start, end, c = [int(n) for n in header.split(',')]
125
origin, text = lines.pop(0).split(' ', 1)
126
contents.append((int(origin), text))
127
yield start, end, c, contents
129
def parse_line_delta(self, lines, version):
130
return list(self.parse_line_delta_iter(lines))
132
def lower_fulltext(self, content):
133
return ['%d %s' % (o, t) for o, t in content._lines]
135
def lower_line_delta(self, delta):
137
for start, end, c, lines in delta:
138
out.append('%d,%d,%d\n' % (start, end, c))
139
for origin, text in lines:
140
out.append('%d %s' % (origin, text))
144
class KnitPlainFactory(_KnitFactory):
145
"""Factory for creating plain Content objects."""
149
def parse_fulltext(self, content, version):
150
return self.make(content, version)
152
def parse_line_delta_iter(self, lines, version):
154
header = lines.pop(0)
155
start, end, c = [int(n) for n in header.split(',')]
156
yield start, end, c, zip([version] * c, lines[:c])
159
def parse_line_delta(self, lines, version):
160
return list(self.parse_line_delta_iter(lines, version))
162
def lower_fulltext(self, content):
163
return content.text()
165
def lower_line_delta(self, delta):
167
for start, end, c, lines in delta:
168
out.append('%d,%d,%d\n' % (start, end, c))
169
out.extend([text for origin, text in lines])
173
def make_empty_knit(transport, relpath):
174
"""Construct a empty knit at the specified location."""
175
from bzrlib.transactions import PassThroughTransaction
176
k = KnitVersionedFile(transport, relpath, 'w', KnitPlainFactory, \
177
PassThroughTransaction())
181
class KnitVersionedFile(VersionedFile):
182
"""Weave-like structure with faster random access.
184
A knit stores a number of texts and a summary of the relationships
185
between them. Texts are identified by a string version-id. Texts
186
are normally stored and retrieved as a series of lines, but can
187
also be passed as single strings.
189
Lines are stored with the trailing newline (if any) included, to
190
avoid special cases for files with no final newline. Lines are
191
composed of 8-bit characters, not unicode. The combination of
192
these approaches should mean any 'binary' file can be safely
193
stored and retrieved.
196
def __init__(self, transport, relpath, mode, factory, transaction,
197
basis_knit=None, delta=True):
198
"""Construct a knit at location specified by relpath. The
199
given transaction will be used throughout the lifetime of the
202
assert mode in ('r', 'w'), "invalid mode specified"
203
assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
206
self.transaction = transaction
207
self.transport = transport
208
self.filename = relpath
209
self.basis_knit = basis_knit
210
self.factory = factory
211
self.writable = (mode == 'w')
214
self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
216
self._data = _KnitData(transport, relpath + DATA_SUFFIX,
220
"""See VersionedFile.versions."""
221
return self._index.get_versions()
223
def has_version(self, version_id):
224
"""See VersionedFile.has_version."""
225
return self._index.has_version(version_id)
227
__contains__ = has_version
229
def _merge_annotations(self, content, parents):
230
"""Merge annotations for content. This is done by comparing
231
the annotations based on changed to the text."""
232
for parent_id in parents:
233
merge_content = self._get_content(parent_id)
234
seq = SequenceMatcher(None, merge_content.text(), content.text())
235
for i, j, n in seq.get_matching_blocks():
238
content._lines[j:j+n] = merge_content._lines[i:i+n]
240
def _get_components(self, version_id):
241
"""Return a list of (version_id, method, data) tuples that
242
makes up version specified by version_id of the knit.
244
The components should be applied in the order of the returned
247
The basis knit will be used to the largest extent possible
248
since it is assumed that accesses to it is faster.
250
# needed_revisions holds a list of (method, version_id) of
251
# versions that is needed to be fetched to construct the final
252
# version of the file.
254
# basis_revisions is a list of versions that needs to be
255
# fetched but exists in the basis knit.
257
basis = self.basis_knit
264
if basis and basis._index.has_version(cursor):
266
basis_versions.append(cursor)
267
method = picked_knit._index.get_method(cursor)
268
needed_versions.append((method, cursor))
269
if method == 'fulltext':
271
cursor = picked_knit.get_parents(cursor)[0]
276
for comp_id in basis_versions:
277
data_pos, data_size = basis._index.get_data_position(comp_id)
278
records.append((piece_id, data_pos, data_size))
279
components.update(basis._data.read_records(records))
282
for comp_id in [vid for method, vid in needed_versions
283
if vid not in basis_versions]:
284
data_pos, data_size = self._index.get_position(comp_id)
285
records.append((comp_id, data_pos, data_size))
286
components.update(self._data.read_records(records))
288
# get_data_records returns a mapping with the version id as
289
# index and the value as data. The order the components need
290
# to be applied is held by needed_versions (reversed).
292
for method, comp_id in reversed(needed_versions):
293
out.append((comp_id, method, components[comp_id]))
297
def _get_content(self, version_id):
298
"""Returns a content object that makes up the specified
300
if not self.has_version(version_id):
301
raise RevisionNotPresent(version_id, self.filename)
303
if self.basis_knit and version_id in self.basis_knit:
304
return self.basis_knit._get_content(version_id)
307
components = self._get_components(version_id)
308
for component_id, method, (data, digest) in components:
309
version_idx = self._index.lookup(component_id)
310
if method == 'fulltext':
311
assert content is None
312
content = self.factory.parse_fulltext(data, version_idx)
313
elif method == 'line-delta':
314
delta = self.factory.parse_line_delta(data, version_idx)
315
content.apply_delta(delta)
317
if 'no-eol' in self._index.get_options(version_id):
318
line = content._lines[-1][1].rstrip('\n')
319
content._lines[-1] = (content._lines[-1][0], line)
321
if sha_strings(content.text()) != digest:
322
raise KnitCorrupt(self.filename, 'sha-1 does not match')
326
def clone_text(self, new_version_id, old_version_id, parents,
328
"""See VersionedFile.clone_text."""
329
raise NotImplementedError
331
def _check_versions_present(self, version_ids):
332
"""Check that all specified versions are present."""
333
version_ids = set(version_ids)
334
for r in list(version_ids):
335
if self._index.has_version(r):
336
version_ids.remove(r)
338
raise RevisionNotPresent(list(version_ids)[0], self.filename)
340
def add_lines(self, version_id, parents, lines):
341
"""See VersionedFile.add_lines."""
342
assert self.writable, "knit is not opened for write"
343
### FIXME escape. RBC 20060228
344
if contains_whitespace(version_id):
345
raise InvalidRevisionId(version_id)
346
if self.has_version(version_id):
347
raise RevisionAlreadyPresent(version_id, self.filename)
349
if True or __debug__:
351
assert '\n' not in l[:-1]
353
self._check_versions_present(parents)
354
return self._add(version_id, lines[:], parents, self.delta)
356
def _add(self, version_id, lines, parents, delta):
357
"""Add a set of lines on top of version specified by parents.
359
If delta is true, compress the text as a line-delta against
362
if delta and not parents:
365
digest = sha_strings(lines)
368
if lines[-1][-1] != '\n':
369
options.append('no-eol')
370
lines[-1] = lines[-1] + '\n'
372
lines = self.factory.make(lines, len(self._index))
373
if self.factory.annotated and len(parents) > 0:
374
# Merge annotations from parent texts if so is needed.
375
self._merge_annotations(lines, parents)
377
if parents and delta:
378
# To speed the extract of texts the delta chain is limited
379
# to a fixed number of deltas. This should minimize both
380
# I/O and the time spend applying deltas.
382
delta_parents = parents
384
parent = delta_parents[0]
385
method = self._index.get_method(parent)
386
if method == 'fulltext':
388
delta_parents = self._index.get_parents(parent)
390
if method == 'line-delta':
394
options.append('line-delta')
395
content = self._get_content(parents[0])
396
delta_hunks = content.line_delta(lines)
397
store_lines = self.factory.lower_line_delta(delta_hunks)
399
options.append('fulltext')
400
store_lines = self.factory.lower_fulltext(lines)
402
where, size = self._data.add_record(version_id, digest, store_lines)
403
self._index.add_version(version_id, options, where, size, parents)
405
def clone_text(self, new_version_id, old_version_id, parents, transaction):
406
"""See VersionedFile.clone_text()."""
407
# FIXME RBC 20060228 make fast by only inserting an index with null delta.
408
self.add_lines(new_version_id, parents, self.get_lines(old_version_id))
410
def get_lines(self, version_id):
411
"""See VersionedFile.get_lines()."""
412
return self._get_content(version_id).text()
414
def annotate_iter(self, version_id):
415
"""See VersionedFile.annotate_iter."""
416
content = self._get_content(version_id)
417
for origin, text in content.annotate_iter():
418
yield self._index.idx_to_name(origin), text
420
def get_parents(self, version_id):
421
"""See VersionedFile.get_parents."""
422
self._check_versions_present([version_id])
423
return list(self._index.get_parents(version_id))
425
def get_ancestry(self, versions):
426
"""See VersionedFile.get_ancestry."""
427
if isinstance(versions, basestring):
428
versions = [versions]
431
self._check_versions_present(versions)
432
return self._index.get_ancestry(versions)
434
def _reannotate_line_delta(self, other, lines, new_version_id,
436
"""Re-annotate line-delta and return new delta."""
438
for start, end, count, contents \
439
in self.factory.parse_line_delta_iter(lines):
441
for origin, line in contents:
442
old_version_id = other._index.idx_to_name(origin)
443
if old_version_id == new_version_id:
444
idx = new_version_idx
446
idx = self._index.lookup(old_version_id)
447
new_lines.append((idx, line))
448
new_delta.append((start, end, count, new_lines))
450
return self.factory.lower_line_delta(new_delta)
452
def _reannotate_fulltext(self, other, lines, new_version_id,
454
"""Re-annotate fulltext and return new version."""
455
content = self.factory.parse_fulltext(lines, new_version_idx)
457
for origin, line in content.annotate_iter():
458
old_version_id = other._index.idx_to_name(origin)
459
if old_version_id == new_version_id:
460
idx = new_version_idx
462
idx = self._index.lookup(old_version_id)
463
new_lines.append((idx, line))
465
return self.factory.lower_fulltext(KnitContent(new_lines))
467
def join(self, other, pb=None, msg=None, version_ids=None):
468
"""See VersionedFile.join."""
469
assert isinstance(other, KnitVersionedFile)
471
if version_ids is None:
472
version_ids = other.versions()
477
from bzrlib.progress import DummyProgress
480
version_ids = list(version_ids)
481
if None in version_ids:
482
version_ids.remove(None)
484
other_ancestry = set(other.get_ancestry(version_ids))
485
needed_versions = other_ancestry - set(self._index.get_versions())
486
if not needed_versions:
488
full_list = topo_sort(other._index.get_graph())
490
version_list = [i for i in full_list if (not self.has_version(i)
491
and i in needed_versions)]
494
for version_id in version_list:
495
data_pos, data_size = other._index.get_position(version_id)
496
records.append((version_id, data_pos, data_size))
499
for version_id, lines, digest \
500
in other._data.read_records_iter(records):
501
options = other._index.get_options(version_id)
502
parents = other._index.get_parents(version_id)
504
for parent in parents:
505
assert self.has_version(parent)
507
if self.factory.annotated:
508
# FIXME jrydberg: it should be possible to skip
509
# re-annotating components if we know that we are
510
# going to pull all revisions in the same order.
511
new_version_id = version_id
512
new_version_idx = self._index.num_versions()
513
if 'fulltext' in options:
514
lines = self._reannotate_fulltext(other, lines,
515
new_version_id, new_version_idx)
516
elif 'line-delta' in options:
517
lines = self._reannotate_line_delta(other, lines,
518
new_version_id, new_version_idx)
521
pb.update(self.filename, count, len(version_list))
523
pos, size = self._data.add_record(version_id, digest, lines)
524
self._index.add_version(version_id, options, pos, size, parents)
529
def walk(self, version_ids):
530
"""See VersionedFile.walk."""
531
# We take the short path here, and extract all relevant texts
532
# and put them in a weave and let that do all the work. Far
533
# from optimal, but is much simpler.
534
from bzrlib.weave import Weave
535
from bzrlib.transactions import PassThroughTransaction
537
w = Weave(self.filename)
538
ancestry = self.get_ancestry(version_ids)
539
sorted_graph = topo_sort(self._index.get_graph())
540
version_list = [vid for vid in sorted_graph if vid in ancestry]
541
txn = PassThroughTransaction()
543
for version_id in version_list:
544
lines = self.get_lines(version_id)
545
w.add_lines(version_id, self.get_parents(version_id), lines)
547
for lineno, insert_id, dset, line in w.walk(version_ids):
548
yield lineno, insert_id, dset, line
551
class _KnitComponentFile(object):
552
"""One of the files used to implement a knit database"""
554
def __init__(self, transport, filename, mode, transaction):
555
self._transport = transport
556
self._filename = filename
557
self._transaction = transaction
560
def write_header(self):
561
old_len = self._transport.append(self._filename, self.HEADER)
563
raise KnitCorrupt(self._filename, 'misaligned after writing header')
565
def check_header(self, fp):
566
line = fp.read(len(self.HEADER))
567
if line != self.HEADER:
568
raise KnitHeaderError(badline=line)
571
"""Commit is a nop."""
574
return '%s(%s)' % (self.__class__.__name__, self._filename)
577
class _KnitIndex(_KnitComponentFile):
578
"""Manages knit index file.
580
The index is already kept in memory and read on startup, to enable
581
fast lookups of revision information. The cursor of the index
582
file is always pointing to the end, making it easy to append
585
_cache is a cache for fast mapping from version id to a Index
588
_history is a cache for fast mapping from indexes to version ids.
590
The index data format is dictionary compressed when it comes to
parent references; an index entry may only have parents with a
lower index number.  As a result, the index is topologically sorted.
595
HEADER = "# bzr knit index 7\n"
597
def _cache_version(self, version_id, options, pos, size, parents):
598
val = (version_id, options, pos, size, parents)
599
self._cache[version_id] = val
600
self._history.append(version_id)
602
def _iter_index(self, fp):
604
for l in lines.splitlines(False):
607
def __init__(self, transport, filename, mode, transaction):
608
_KnitComponentFile.__init__(self, transport, filename, mode, transaction)
612
fp = self._transport.get(self._filename)
613
self.check_header(fp)
614
for rec in self._iter_index(fp):
615
self._cache_version(rec[0], rec[1].split(','), int(rec[2]), int(rec[3]),
616
[self._history[int(i)] for i in rec[4:]])
617
except NoSuchFile, e:
624
for version_id, index in self._cache.iteritems():
625
graph.append((version_id, index[4]))
628
def get_ancestry(self, versions):
629
"""See VersionedFile.get_ancestry."""
631
for version_id in versions:
632
version_idxs.append(self._history.index(version_id))
634
for v in xrange(max(version_idxs), 0, -1):
635
if self._history[v] in i:
636
# include all its parents
637
i.update(self._cache[self._history[v]][4])
640
def num_versions(self):
641
return len(self._history)
643
__len__ = num_versions
645
def get_versions(self):
648
def idx_to_name(self, idx):
649
return self._history[idx]
651
def lookup(self, version_id):
652
assert version_id in self._cache
653
return self._history.index(version_id)
655
def add_version(self, version_id, options, pos, size, parents):
656
"""Add a version record to the index."""
657
self._cache_version(version_id, options, pos, size, parents)
659
content = "%s %s %s %s %s\n" % (version_id,
663
' '.join([str(self.lookup(vid)) for
665
self._transport.append(self._filename, content)
667
def has_version(self, version_id):
668
"""True if the version is in the index."""
669
return self._cache.has_key(version_id)
671
def get_position(self, version_id):
672
"""Return data position and size of specified version."""
673
return (self._cache[version_id][2], \
674
self._cache[version_id][3])
676
def get_method(self, version_id):
677
"""Return compression method of specified version."""
678
options = self._cache[version_id][1]
679
if 'fulltext' in options:
682
assert 'line-delta' in options
685
def get_options(self, version_id):
686
return self._cache[version_id][1]
688
def get_parents(self, version_id):
689
"""Return parents of specified version."""
690
return self._cache[version_id][4]
692
def check_versions_present(self, version_ids):
693
"""Check that all specified versions are present."""
694
version_ids = set(version_ids)
695
for version_id in list(version_ids):
696
if version_id in self._cache:
697
version_ids.remove(version_id)
699
raise RevisionNotPresent(list(version_ids)[0], self.filename)
702
class _KnitData(_KnitComponentFile):
703
"""Contents of the knit data file"""
705
HEADER = "# bzr knit data 7\n"
707
def __init__(self, transport, filename, mode, transaction):
708
_KnitComponentFile.__init__(self, transport, filename, mode, transaction)
710
self._checked = False
712
def _open_file(self):
713
if self._file is None:
715
self._file = self._transport.get(self._filename)
720
def add_record(self, version_id, digest, lines):
721
"""Write new text record to disk. Returns the position in the
722
file where it was written."""
724
data_file = GzipFile(None, mode='wb', fileobj=sio)
725
print >>data_file, "version %s %d %s" % (version_id, len(lines), digest)
726
data_file.writelines(lines)
727
print >>data_file, "end %s\n" % version_id
730
content = sio.getvalue()
731
start_pos = self._transport.append(self._filename, content)
732
return start_pos, len(content)
734
def _parse_record(self, version_id, data):
735
df = GzipFile(mode='rb', fileobj=StringIO(data))
736
rec = df.readline().split()
738
raise KnitCorrupt(self._filename, 'unexpected number of records')
739
if rec[1] != version_id:
740
raise KnitCorrupt(self.file.name,
741
'unexpected version, wanted %r' % version_id)
743
record_contents = self._read_record_contents(df, lines)
745
if l != 'end %s\n' % version_id:
746
raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
748
return record_contents, rec[3]
750
def _read_record_contents(self, df, record_lines):
751
"""Read and return n lines from datafile."""
753
for i in range(record_lines):
754
r.append(df.readline())
757
def read_records_iter(self, records):
758
"""Read text records from data file and yield result.
760
Each passed record is a tuple of (version_id, pos, len) and
761
will be read in the given order. Yields (version_id,
765
class ContinuousRange:
766
def __init__(self, rec_id, pos, size):
768
self.end_pos = pos + size
769
self.versions = [(rec_id, pos, size)]
771
def add(self, rec_id, pos, size):
772
if self.end_pos != pos:
774
self.end_pos = pos + size
775
self.versions.append((rec_id, pos, size))
779
for rec_id, pos, size in self.versions:
780
yield rec_id, fp.read(size)
782
fp = self._open_file()
784
# Loop through all records and try to collect as large
785
# continuous region as possible to read.
787
record_id, pos, size = records.pop(0)
788
continuous_range = ContinuousRange(record_id, pos, size)
790
record_id, pos, size = records[0]
791
if continuous_range.add(record_id, pos, size):
795
fp.seek(continuous_range.start_pos, 0)
796
for record_id, data in continuous_range.split(fp):
797
content, digest = self._parse_record(record_id, data)
798
yield record_id, content, digest
802
def read_records(self, records):
803
"""Read records into a dictionary."""
805
for record_id, content, digest in self.read_records_iter(records):
806
components[record_id] = (content, digest)