~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

Committer: Aaron Bentley
Date: 2006-04-07 22:39:47 UTC
mfrom: (1558.12.2 bzr.ab.integration)
mto: (1731.2.9 nested-trees) (2234.6.1 bzr.0.14) (2229.2.4 reserved-ids) (2323.6.9 0.15-integration) (1551.19.24 Aaron's mergeable stuff)
mto: This revision was merged to the branch mainline in revision 1669.
Revision ID: aaron.bentley@utoronto.ca-20060407223947-730e8436858d086e

Merge from bzr.ab.integration

files added:
bzrlib/tuned_gzip.py

files modified:
HACKING

bzrlib/builtins.py

bzrlib/commit.py

bzrlib/inventory.py

bzrlib/knit.py

bzrlib/osutils.py

bzrlib/progress.py

bzrlib/repository.py

bzrlib/store/revision/knit.py

bzrlib/store/versioned/__init__.py

bzrlib/tests/test_basis_inventory.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_xml.py

bzrlib/transform.py

bzrlib/transport/__init__.py

bzrlib/transport/http/__init__.py

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/sftp.py

bzrlib/workingtree.py

bzrlib/xml5.py

Show diffs side-by-side

added added

removed removed

bzrlib/knit.py

from copy import copy

from cStringIO import StringIO

import difflib

import gzip

from itertools import izip, chain

import os

import sys

import bzrlib

import bzrlib.errors as errors

from bzrlib.errors import FileExists, NoSuchFile, KnitError, \

InvalidRevisionId, KnitCorrupt, KnitHeaderError, \

RevisionNotPresent, RevisionAlreadyPresent

from bzrlib.tuned_gzip import *

from bzrlib.trace import mutter

from bzrlib.osutils import contains_whitespace, contains_linebreaks, \

sha_strings

161

return KnitContent(lines)

162

163

def parse_line_delta_iter(self, lines):

164

for result_item in self.parse_line_delta[lines]:

165

yield result_item

166

167

def parse_line_delta(self, lines, version):

164

168

"""Convert a line based delta into internal representation.

165

169

166

170

line delta is in the form of:

170

174

internal representation is

171

175

(start, end, count, [1..count tuples (revid, newline)])

172

176

"""

173

while lines:

174

header = lines.pop(0)

175

start, end, c = [int(n) for n in header.split(',')]

177

result = []

178

lines = iter(lines)

179

next = lines.next

180

# walk through the lines parsing.

181

for header in lines:

182

start, end, count = [int(n) for n in header.split(',')]

176

183

contents = []

177

for i in range(c):

178

origin, text = lines.pop(0).split(' ', 1)

184

remaining = count

185

while remaining:

186

origin, text = next().split(' ', 1)

187

remaining -= 1

179

188

contents.append((origin.decode('utf-8'), text))

180

yield start, end, c, contents

181

182

def parse_line_delta(self, lines, version):

183

return list(self.parse_line_delta_iter(lines))

189

result.append((start, end, count, contents))

190

return result

184

191

185

192

def lower_fulltext(self, content):

186

193

"""convert a fulltext content record into a serializable form.

192

199

def lower_line_delta(self, delta):

193

200

"""convert a delta into a serializable form.

194

201

195

See parse_line_delta_iter which this inverts.

202

See parse_line_delta which this inverts.

196

203

"""

197

204

out = []

198

205

for start, end, c, lines in delta:

487

494

The basis knit will be used to the largest extent possible

488

495

since it is assumed that access to it is faster.

489

496

"""

497

#profile notes:

498

# 4168 calls in 14912, 2289 internal

499

# 4168 in 9711 to read_records

500

# 52554 in 1250 to get_parents

501

# 170166 in 865 to list.append

502

490

503

# needed_revisions holds a list of (method, version_id) of

491

504

# versions that need to be fetched to construct the final

492

505

# version of the file.

749

762

750

763

def get_parents(self, version_id):

751

764

"""See VersionedFile.get_parents."""

752

self._check_versions_present([version_id])

753

return list(self._index.get_parents(version_id))

765

# perf notes:

766

# optimism counts!

767

# 52554 calls in 1264 872 internal down from 3674

768

try:

769

return self._index.get_parents(version_id)

770

except KeyError:

771

raise RevisionNotPresent(version_id, self.filename)

754

772

755

773

def get_parents_with_ghosts(self, version_id):

756

774

"""See VersionedFile.get_parents_with_ghosts."""

757

self._check_versions_present([version_id])

758

return list(self._index.get_parents_with_ghosts(version_id))

775

try:

776

return self._index.get_parents_with_ghosts(version_id)

777

except KeyError:

778

raise RevisionNotPresent(version_id, self.filename)

759

779

760

780

def get_ancestry(self, versions):

761

781

"""See VersionedFile.get_ancestry."""

860

880

# only want the _history index to reference the 1st index entry

861

881

# for version_id

862

882

if version_id not in self._cache:

883

index = len(self._history)

863

884

self._history.append(version_id)

864

self._cache[version_id] = (version_id, options, pos, size, parents)

885

else:

886

index = self._cache[version_id][5]

887

self._cache[version_id] = (version_id,

888

options,

889

pos,

890

size,

891

parents,

892

index)

865

893

866

894

def __init__(self, transport, filename, mode, create=False):

867

895

_KnitComponentFile.__init__(self, transport, filename, mode)

916

944

# index entry for version_id

917

945

version_id = rec[0]

918

946

if version_id not in self._cache:

947

index = len(self._history)

919

948

self._history.append(version_id)

949

else:

950

index = self._cache[version_id][5]

920

951

self._cache[version_id] = (version_id,

921

952

rec[1].split(','),

922

953

int(rec[2]),

923

954

int(rec[3]),

924

parents)

955

parents,

956

index)

925

957

# --- self._cache_version

926

958

except NoSuchFile, e:

927

959

if mode != 'w' or not create:

1012

1044

1013

1045

def lookup(self, version_id):

1014

1046

assert version_id in self._cache

1015

return self._history.index(version_id)

1047

return self._cache[version_id][5]

1016

1048

1017

1049

def _version_list_to_index(self, versions):

1018

1050

result_list = []

1019

1051

for version in versions:

1020

1052

if version in self._cache:

1021

result_list.append(str(self._history.index(version)))

1053

# -- inlined lookup() --

1054

result_list.append(str(self._cache[version][5]))

1055

# -- end lookup () --

1022

1056

else:

1023

1057

result_list.append('.' + version.encode('utf-8'))

1024

1058

return ' '.join(result_list)

1112

1146

len(lines),

1113

1147

digest)],

1114

1148

lines,

1115

["end %s\n\n" % version_id.encode('utf-8')]))

1149

["end %s\n" % version_id.encode('utf-8')]))

1116

1150

data_file.close()

1117

1151

length= sio.tell()

1118

1152

1186

return df, rec

1153

1187

1154

1188

def _parse_record(self, version_id, data):

1189

# profiling notes:

1190

# 4168 calls in 2880 217 internal

1191

# 4168 calls to _parse_record_header in 2121

1192

# 4168 calls to readlines in 330

1155

1193

df, rec = self._parse_record_header(version_id, data)

1156

lines = int(rec[2])

1157

record_contents = self._read_record_contents(df, lines)

1158

l = df.readline()

1194

record_contents = df.readlines()

1195

l = record_contents.pop()

1196

assert len(record_contents) == int(rec[2])

1159

1197

if l.decode('utf-8') != 'end %s\n' % version_id:

1160

1198

raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'

1161

1199

% (l, version_id))

1162

1200

df.close()

1163

1201

return record_contents, rec[3]

1164

1202

1165

def _read_record_contents(self, df, record_lines):

1166

"""Read and return n lines from datafile."""

1167

r = []

1168

for i in range(record_lines):

1169

r.append(df.readline())

1170

return r

1171

1172

1203

def read_records_iter_raw(self, records):

1173

1204

"""Read text records from data file and yield raw data.

1174

1205

1212

1243

will be read in the given order. Yields (version_id,

1213

1244

contents, digest).

1214

1245

"""

1246

# profiling notes:

1247

# 60890 calls for 4168 extractions in 5045, 683 internal.

1248

# 4168 calls to readv in 1411

1249

# 4168 calls to parse_record in 2880

1215

1250

1216

1251

needed_records = []

1217

1252

for version_id, pos, size in records:

1229

1264

self._records[record_id] = (digest, content)

1230

1265

1231

1266

for version_id, pos, size in records:

1232

yield version_id, copy(self._records[version_id][1]), copy(self._records[version_id][0])

1267

yield version_id, list(self._records[version_id][1]), self._records[version_id][0]

1233

1268

1234

1269

def read_records(self, records):

1235

1270

"""Read records into a dictionary."""

1363

1398

InterVersionedFile.register_optimiser(InterKnit)

1364

1399

1365

1400

1366

# make GzipFile faster:

1367

import zlib

1368

class GzipFile(gzip.GzipFile):

1369

"""Knit tuned version of GzipFile.

1370

1371

This is based on the following lsprof stats:

1372

python 2.4 stock GzipFile write:

1373

58971 0 5644.3090 2721.4730 gzip:193(write)

1374

+58971 0 1159.5530 1159.5530 +<built-in method compress>

1375

+176913 0 987.0320 987.0320 +<len>

1376

+58971 0 423.1450 423.1450 +<zlib.crc32>

1377

+58971 0 353.1060 353.1060 +<method 'write' of 'cStringIO.

1378

StringO' objects>

1379

tuned GzipFile write:

1380

58971 0 4477.2590 2103.1120 bzrlib.knit:1250(write)

1381

+58971 0 1297.7620 1297.7620 +<built-in method compress>

1382

+58971 0 406.2160 406.2160 +<zlib.crc32>

1383

+58971 0 341.9020 341.9020 +<method 'write' of 'cStringIO.

1384

StringO' objects>

1385

+58971 0 328.2670 328.2670 +<len>

1386

1387

1388

Yes, it's only 1.6 seconds, but they add up.

1389

"""

1390

1391

def write(self, data):

1392

if self.mode != gzip.WRITE:

1393

import errno

1394

raise IOError(errno.EBADF, "write() on read-only GzipFile object")

1395

1396

if self.fileobj is None:

1397

raise ValueError, "write() on closed GzipFile object"

1398

data_len = len(data)

1399

if data_len > 0:

1400

self.size = self.size + data_len

1401

self.crc = zlib.crc32(data, self.crc)

1402

self.fileobj.write( self.compress.compress(data) )

1403

self.offset += data_len

1404

1405

def writelines(self, lines):

1406

# profiling indicated a significant overhead

1407

# calling write for each line.

1408

# this batch call is a lot faster :).

1409

# (4 seconds to 1 second for the sample upgrades I was testing).

1410

self.write(''.join(lines))

1411

1412

1413

1401

class SequenceMatcher(difflib.SequenceMatcher):

1414

1402

"""Knit tuned sequence matcher.

1415

1403

Older »