~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

Merge from bzr.ab.integration

Show diffs side-by-side

added added

removed removed

Lines of Context:
66
66
from copy import copy
67
67
from cStringIO import StringIO
68
68
import difflib
69
 
import gzip
70
69
from itertools import izip, chain
71
70
import os
72
 
 
 
71
import sys
73
72
 
74
73
import bzrlib
75
74
import bzrlib.errors as errors
76
75
from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
77
76
        InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
78
77
        RevisionNotPresent, RevisionAlreadyPresent
 
78
from bzrlib.tuned_gzip import *
79
79
from bzrlib.trace import mutter
80
80
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
81
81
     sha_strings
161
161
        return KnitContent(lines)
162
162
 
163
163
    def parse_line_delta_iter(self, lines):
 
164
        for result_item in self.parse_line_delta[lines]:
 
165
            yield result_item
 
166
 
 
167
    def parse_line_delta(self, lines, version):
164
168
        """Convert a line based delta into internal representation.
165
169
 
166
170
        line delta is in the form of:
170
174
        internal representation is
171
175
        (start, end, count, [1..count tuples (revid, newline)])
172
176
        """
173
 
        while lines:
174
 
            header = lines.pop(0)
175
 
            start, end, c = [int(n) for n in header.split(',')]
 
177
        result = []
 
178
        lines = iter(lines)
 
179
        next = lines.next
 
180
        # walk through the lines parsing.
 
181
        for header in lines:
 
182
            start, end, count = [int(n) for n in header.split(',')]
176
183
            contents = []
177
 
            for i in range(c):
178
 
                origin, text = lines.pop(0).split(' ', 1)
 
184
            remaining = count
 
185
            while remaining:
 
186
                origin, text = next().split(' ', 1)
 
187
                remaining -= 1
179
188
                contents.append((origin.decode('utf-8'), text))
180
 
            yield start, end, c, contents
181
 
 
182
 
    def parse_line_delta(self, lines, version):
183
 
        return list(self.parse_line_delta_iter(lines))
 
189
            result.append((start, end, count, contents))
 
190
        return result
184
191
 
185
192
    def lower_fulltext(self, content):
186
193
        """convert a fulltext content record into a serializable form.
192
199
    def lower_line_delta(self, delta):
193
200
        """convert a delta into a serializable form.
194
201
 
195
 
        See parse_line_delta_iter which this inverts.
 
202
        See parse_line_delta which this inverts.
196
203
        """
197
204
        out = []
198
205
        for start, end, c, lines in delta:
487
494
        The basis knit will be used to the largest extent possible
488
495
        since it is assumed that accesses to it are faster.
489
496
        """
 
497
        #profile notes:
 
498
        # 4168 calls in 14912, 2289 internal
 
499
        # 4168 in 9711 to read_records
 
500
        # 52554 in 1250 to get_parents
 
501
        # 170166 in 865 to list.append
 
502
        
490
503
        # needed_revisions holds a list of (method, version_id) of
491
504
        # versions that need to be fetched to construct the final
492
505
        # version of the file.
749
762
 
750
763
    def get_parents(self, version_id):
751
764
        """See VersionedFile.get_parents."""
752
 
        self._check_versions_present([version_id])
753
 
        return list(self._index.get_parents(version_id))
 
765
        # perf notes:
 
766
        # optimism counts!
 
767
        # 52554 calls in 1264 872 internal down from 3674
 
768
        try:
 
769
            return self._index.get_parents(version_id)
 
770
        except KeyError:
 
771
            raise RevisionNotPresent(version_id, self.filename)
754
772
 
755
773
    def get_parents_with_ghosts(self, version_id):
756
774
        """See VersionedFile.get_parents_with_ghosts."""
757
 
        self._check_versions_present([version_id])
758
 
        return list(self._index.get_parents_with_ghosts(version_id))
 
775
        try:
 
776
            return self._index.get_parents_with_ghosts(version_id)
 
777
        except KeyError:
 
778
            raise RevisionNotPresent(version_id, self.filename)
759
779
 
760
780
    def get_ancestry(self, versions):
761
781
        """See VersionedFile.get_ancestry."""
860
880
        # only want the _history index to reference the 1st index entry
861
881
        # for version_id
862
882
        if version_id not in self._cache:
 
883
            index = len(self._history)
863
884
            self._history.append(version_id)
864
 
        self._cache[version_id] = (version_id, options, pos, size, parents)
 
885
        else:
 
886
            index = self._cache[version_id][5]
 
887
        self._cache[version_id] = (version_id, 
 
888
                                   options,
 
889
                                   pos,
 
890
                                   size,
 
891
                                   parents,
 
892
                                   index)
865
893
 
866
894
    def __init__(self, transport, filename, mode, create=False):
867
895
        _KnitComponentFile.__init__(self, transport, filename, mode)
916
944
                    # index entry for version_id
917
945
                    version_id = rec[0]
918
946
                    if version_id not in self._cache:
 
947
                        index = len(self._history)
919
948
                        self._history.append(version_id)
 
949
                    else:
 
950
                        index = self._cache[version_id][5]
920
951
                    self._cache[version_id] = (version_id,
921
952
                                               rec[1].split(','),
922
953
                                               int(rec[2]),
923
954
                                               int(rec[3]),
924
 
                                               parents)
 
955
                                               parents,
 
956
                                               index)
925
957
                    # --- self._cache_version 
926
958
            except NoSuchFile, e:
927
959
                if mode != 'w' or not create:
1012
1044
 
1013
1045
    def lookup(self, version_id):
1014
1046
        assert version_id in self._cache
1015
 
        return self._history.index(version_id)
 
1047
        return self._cache[version_id][5]
1016
1048
 
1017
1049
    def _version_list_to_index(self, versions):
1018
1050
        result_list = []
1019
1051
        for version in versions:
1020
1052
            if version in self._cache:
1021
 
                result_list.append(str(self._history.index(version)))
 
1053
                # -- inlined lookup() --
 
1054
                result_list.append(str(self._cache[version][5]))
 
1055
                # -- end lookup () --
1022
1056
            else:
1023
1057
                result_list.append('.' + version.encode('utf-8'))
1024
1058
        return ' '.join(result_list)
1112
1146
                                     len(lines),
1113
1147
                                     digest)],
1114
1148
            lines,
1115
 
            ["end %s\n\n" % version_id.encode('utf-8')]))
 
1149
            ["end %s\n" % version_id.encode('utf-8')]))
1116
1150
        data_file.close()
1117
1151
        length= sio.tell()
1118
1152
 
1152
1186
        return df, rec
1153
1187
 
1154
1188
    def _parse_record(self, version_id, data):
 
1189
        # profiling notes:
 
1190
        # 4168 calls in 2880 217 internal
 
1191
        # 4168 calls to _parse_record_header in 2121
 
1192
        # 4168 calls to readlines in 330
1155
1193
        df, rec = self._parse_record_header(version_id, data)
1156
 
        lines = int(rec[2])
1157
 
        record_contents = self._read_record_contents(df, lines)
1158
 
        l = df.readline()
 
1194
        record_contents = df.readlines()
 
1195
        l = record_contents.pop()
 
1196
        assert len(record_contents) == int(rec[2])
1159
1197
        if l.decode('utf-8') != 'end %s\n' % version_id:
1160
1198
            raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r' 
1161
1199
                        % (l, version_id))
1162
1200
        df.close()
1163
1201
        return record_contents, rec[3]
1164
1202
 
1165
 
    def _read_record_contents(self, df, record_lines):
1166
 
        """Read and return n lines from datafile."""
1167
 
        r = []
1168
 
        for i in range(record_lines):
1169
 
            r.append(df.readline())
1170
 
        return r
1171
 
 
1172
1203
    def read_records_iter_raw(self, records):
1173
1204
        """Read text records from data file and yield raw data.
1174
1205
 
1212
1243
        will be read in the given order.  Yields (version_id,
1213
1244
        contents, digest).
1214
1245
        """
 
1246
        # profiling notes:
 
1247
        # 60890  calls for 4168 extractions in 5045, 683 internal.
 
1248
        # 4168   calls to readv              in 1411
 
1249
        # 4168   calls to parse_record       in 2880
1215
1250
 
1216
1251
        needed_records = []
1217
1252
        for version_id, pos, size in records:
1229
1264
                self._records[record_id] = (digest, content)
1230
1265
    
1231
1266
        for version_id, pos, size in records:
1232
 
            yield version_id, copy(self._records[version_id][1]), copy(self._records[version_id][0])
 
1267
            yield version_id, list(self._records[version_id][1]), self._records[version_id][0]
1233
1268
 
1234
1269
    def read_records(self, records):
1235
1270
        """Read records into a dictionary."""
1363
1398
InterVersionedFile.register_optimiser(InterKnit)
1364
1399
 
1365
1400
 
1366
 
# make GzipFile faster:
1367
 
import zlib
1368
 
class GzipFile(gzip.GzipFile):
1369
 
    """Knit tuned version of GzipFile.
1370
 
 
1371
 
    This is based on the following lsprof stats:
1372
 
    python 2.4 stock GzipFile write:
1373
 
    58971      0   5644.3090   2721.4730   gzip:193(write)
1374
 
    +58971     0   1159.5530   1159.5530   +<built-in method compress>
1375
 
    +176913    0    987.0320    987.0320   +<len>
1376
 
    +58971     0    423.1450    423.1450   +<zlib.crc32>
1377
 
    +58971     0    353.1060    353.1060   +<method 'write' of 'cStringIO.
1378
 
                                            StringO' objects>
1379
 
    tuned GzipFile write:
1380
 
    58971      0   4477.2590   2103.1120   bzrlib.knit:1250(write)
1381
 
    +58971     0   1297.7620   1297.7620   +<built-in method compress>
1382
 
    +58971     0    406.2160    406.2160   +<zlib.crc32>
1383
 
    +58971     0    341.9020    341.9020   +<method 'write' of 'cStringIO.
1384
 
                                            StringO' objects>
1385
 
    +58971     0    328.2670    328.2670   +<len>
1386
 
 
1387
 
 
1388
 
    Yes, it's only 1.6 seconds, but they add up.
1389
 
    """
1390
 
 
1391
 
    def write(self, data):
1392
 
        if self.mode != gzip.WRITE:
1393
 
            import errno
1394
 
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")
1395
 
 
1396
 
        if self.fileobj is None:
1397
 
            raise ValueError, "write() on closed GzipFile object"
1398
 
        data_len = len(data)
1399
 
        if data_len > 0:
1400
 
            self.size = self.size + data_len
1401
 
            self.crc = zlib.crc32(data, self.crc)
1402
 
            self.fileobj.write( self.compress.compress(data) )
1403
 
            self.offset += data_len
1404
 
 
1405
 
    def writelines(self, lines):
1406
 
        # profiling indicated a significant overhead 
1407
 
        # calling write for each line.
1408
 
        # this batch call is a lot faster :).
1409
 
        # (4 seconds to 1 seconds for the sample upgrades I was testing).
1410
 
        self.write(''.join(lines))
1411
 
 
1412
 
 
1413
1401
class SequenceMatcher(difflib.SequenceMatcher):
1414
1402
    """Knit tuned sequence matcher.
1415
1403