~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

Committer: Aaron Bentley
Date: 2006-12-21 21:20:08 UTC
mfrom: (2212 +trunk)
mto: (2234.6.1 bzr.0.14) (2229.2.4 reserved-ids) (2323.6.9 0.15-integration) (1551.19.24 Aaron's mergeable stuff)
mto: This revision was merged to the branch mainline in revision 2214.
Revision ID: abentley@panoramicfeedback.com-20061221212008-xwcgsfwa9jyrkmna

Merge bzr.dev

files modified:
NEWS

bzrlib/builtins.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/knit.py

bzrlib/repository.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_knit.py

bzrlib/transport/chroot.py

bzrlib/transport/http/_urllib.py

Show diffs side-by-side

added added

removed removed

bzrlib/knit.py

# Written by Martin Pool.

# Modified by Johan Rydberg <jrydberg@gnu.org>

# Modified by Robert Collins <robert.collins@canonical.com>

# Modified by Aaron Bentley <aaron.bentley@utoronto.ca>

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

errors,

patiencediff,

progress,

)

from bzrlib.errors import FileExists, NoSuchFile, KnitError, \

InvalidRevisionId, KnitCorrupt, KnitHeaderError, \

RevisionNotPresent, RevisionAlreadyPresent

ui,

)

from bzrlib.errors import (

FileExists,

NoSuchFile,

KnitError,

InvalidRevisionId,

KnitCorrupt,

KnitHeaderError,

RevisionNotPresent,

RevisionAlreadyPresent,

)

from bzrlib.tuned_gzip import GzipFile

from bzrlib.trace import mutter

from bzrlib.osutils import contains_whitespace, contains_linebreaks, \

sha_strings

from bzrlib.osutils import (

contains_whitespace,

contains_linebreaks,

sha_strings,

)

from bzrlib.symbol_versioning import DEPRECATED_PARAMETER, deprecated_passed

from bzrlib.tsort import topo_sort

import bzrlib.ui

659

666

660

667

def _check_versions_present(self, version_ids):

661

668

"""Check that all specified versions are present."""

662

version_ids = set(version_ids)

663

for r in list(version_ids):

664

if self._index.has_version(r):

665

version_ids.remove(r)

666

if version_ids:

667

raise RevisionNotPresent(list(version_ids)[0], self.filename)

669

self._index.check_versions_present(version_ids)

668

670

669

671

def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):

670

672

"""See VersionedFile.add_lines_with_ghosts()."""

928

930

versions = [versions]

929

931

if not versions:

930

932

return []

931

self._check_versions_present(versions)

932

933

return self._index.get_ancestry(versions)

933

934

935

def get_ancestry_with_ghosts(self, versions):

937

938

versions = [versions]

938

939

if not versions:

939

940

return []

940

self._check_versions_present(versions)

941

return self._index.get_ancestry_with_ghosts(versions)

942

943

#@deprecated_method(zero_eight)

1014

self._create_parent_dir = create_parent_dir

1015

self._need_to_create = False

1016

1017

def _full_path(self):

1018

"""Return the full path to this file."""

1019

return self._transport.base + self._filename

1020

1017

1021

def check_header(self, fp):

1018

1022

line = fp.readline()

1019

1023

if line == '':

1020

1024

# An empty file can actually be treated as though the file doesn't

1021

1025

# exist yet.

1022

raise errors.NoSuchFile(self._transport.base + self._filename)

1026

raise errors.NoSuchFile(self._full_path())

1023

1027

if line != self.HEADER:

1024

1028

raise KnitHeaderError(badline=line,

1025

1029

filename=self._transport.abspath(self._filename))

1073

1077

The ' :' marker is the end of record marker.

1074

1078

1075

1079

partial writes:

1076

when a write is interrupted to the index file, it will result in a line that

1077

does not end in ' :'. If the ' :' is not present at the end of a line, or at

1078

the end of the file, then the record that is missing it will be ignored by

1079

the parser.

1080

when a write is interrupted to the index file, it will result in a line

1081

that does not end in ' :'. If the ' :' is not present at the end of a line,

1082

or at the end of the file, then the record that is missing it will be

1083

ignored by the parser.

1080

1084

1081

1085

When writing new records to the index file, the data is preceded by '\n'

1082

1086

to ensure that records always start on new lines even if the last write was

1091

1095

1092

1096

def _cache_version(self, version_id, options, pos, size, parents):

1093

1097

"""Cache a version record in the history array and index cache.

1094

1095

This is inlined into __init__ for performance. KEEP IN SYNC.

1098

1099

This is inlined into _load_data for performance. KEEP IN SYNC.

1096

1100

(It saves 60ms, 25% of the __init__ overhead on local 4000 record

1097

1101

indexes).

1098

1102

"""

1123

1127

# in the knit.

1124

1128

self._history = []

1125

1129

decode_utf8 = cache_utf8.decode

1126

pb = bzrlib.ui.ui_factory.nested_progress_bar()

1130

pb = ui.ui_factory.nested_progress_bar()

1127

1131

try:

1128

count = 0

1129

total = 1

1132

pb.update('read knit index', 0, 1)

1130

1133

try:

1131

pb.update('read knit index', count, total)

1132

1134

fp = self._transport.get(self._filename)

1133

1135

try:

1134

self.check_header(fp)

1135

# readlines reads the whole file at once:

1136

# bad for transports like http, good for local disk

1137

# we save 60 ms doing this one change (

1138

# from calling readline each time to calling

1139

# readlines once.

1140

# probably what we want for nice behaviour on

1141

# http is a incremental readlines that yields, or

1142

# a check for local vs non local indexes,

1143

for l in fp.readlines():

1144

rec = l.split()

1145

if len(rec) < 5 or rec[-1] != ':':

1146

# corrupt line.

1147

# FIXME: in the future we should determine if its a

1148

# short write - and ignore it

1149

# or a different failure, and raise. RBC 20060407

1150

continue

1151

count += 1

1152

total += 1

1153

#pb.update('read knit index', count, total)

1154

# See self._parse_parents

1155

parents = []

1156

for value in rec[4:-1]:

1157

if '.' == value[0]:

1158

# uncompressed reference

1159

parents.append(decode_utf8(value[1:]))

1160

else:

1161

# this is 15/4000ms faster than isinstance,

1162

# (in lsprof)

1163

# this function is called thousands of times a

1164

# second so small variations add up.

1165

assert value.__class__ is str

1166

parents.append(self._history[int(value)])

1167

# end self._parse_parents

1168

# self._cache_version(decode_utf8(rec[0]),

1169

# rec[1].split(','),

1170

# int(rec[2]),

1171

# int(rec[3]),

1172

# parents)

1173

# --- self._cache_version

1174

# only want the _history index to reference the 1st

1175

# index entry for version_id

1176

version_id = decode_utf8(rec[0])

1177

if version_id not in self._cache:

1178

index = len(self._history)

1179

self._history.append(version_id)

1180

else:

1181

index = self._cache[version_id][5]

1182

self._cache[version_id] = (version_id,

1183

rec[1].split(','),

1184

int(rec[2]),

1185

int(rec[3]),

1186

parents,

1187

index)

1188

# --- self._cache_version

1136

# _load_data may raise NoSuchFile if the target knit is

1137

# completely empty.

1138

self._load_data(fp)

1189

1139

finally:

1190

1140

fp.close()

1191

except NoSuchFile, e:

1141

except NoSuchFile:

1192

1142

if mode != 'w' or not create:

1193

1143

raise

1194

if delay_create:

1144

elif delay_create:

1195

1145

self._need_to_create = True

1196

1146

else:

1197

self._transport.put_bytes_non_atomic(self._filename,

1198

self.HEADER, mode=self._file_mode)

1199

1147

self._transport.put_bytes_non_atomic(

1148

self._filename, self.HEADER, mode=self._file_mode)

1200

1149

finally:

1201

pb.update('read knit index', total, total)

1150

pb.update('read knit index', 1, 1)

1202

1151

pb.finished()

1203

1152

1204

def _parse_parents(self, compressed_parents):

1205

"""convert a list of string parent values into version ids.

1206

1207

ints are looked up in the index.

1208

.FOO values are ghosts and converted in to FOO.

1209

1210

NOTE: the function is retained here for clarity, and for possible

1211

use in partial index reads. However bulk processing now has

1212

it inlined in __init__ for inner-loop optimisation.

1213

"""

1214

result = []

1215

for value in compressed_parents:

1216

if value[-1] == '.':

1217

# uncompressed reference

1218

result.append(cache_utf8.decode_utf8(value[1:]))

1153

def _load_data(self, fp):

1154

cache = self._cache

1155

history = self._history

1156

decode_utf8 = cache_utf8.decode

1157

1158

self.check_header(fp)

1159

# readlines reads the whole file at once:

1160

# bad for transports like http, good for local disk

1161

# we save 60 ms doing this one change (

1162

# from calling readline each time to calling

1163

# readlines once.

1164

# probably what we want for nice behaviour on

1165

# http is a incremental readlines that yields, or

1166

# a check for local vs non local indexes,

1167

history_top = len(history) - 1

1168

for line in fp.readlines():

1169

rec = line.split()

1170

if len(rec) < 5 or rec[-1] != ':':

1171

# corrupt line.

1172

# FIXME: in the future we should determine if its a

1173

# short write - and ignore it

1174

# or a different failure, and raise. RBC 20060407

1175

continue

1176

1177

parents = []

1178

for value in rec[4:-1]:

1179

if value[0] == '.':

1180

# uncompressed reference

1181

parents.append(decode_utf8(value[1:]))

1182

else:

1183

parents.append(history[int(value)])

1184

1185

version_id, options, pos, size = rec[:4]

1186

version_id = decode_utf8(version_id)

1187

1188

# See self._cache_version

1189

# only want the _history index to reference the 1st

1190

# index entry for version_id

1191

if version_id not in cache:

1192

history_top += 1

1193

index = history_top

1194

history.append(version_id)

1219

1195

else:

1220

# this is 15/4000ms faster than isinstance,

1221

# this function is called thousands of times a

1222

# second so small variations add up.

1223

assert value.__class__ is str

1224

result.append(self._history[int(value)])

1225

return result

1196

index = cache[version_id][5]

1197

cache[version_id] = (version_id,

1198

options.split(','),

1199

int(pos),

1200

int(size),

1201

parents,

1202

index)

1203

# end self._cache_version

1226

1204

1227

1205

def get_graph(self):

1228

graph = []

1229

for version_id, index in self._cache.iteritems():

1230

graph.append((version_id, index[4]))

1231

return graph

1206

return [(vid, idx[4]) for vid, idx in self._cache.iteritems()]

1232

1207

1233

1208

def get_ancestry(self, versions):

1234

1209

"""See VersionedFile.get_ancestry."""

1235

1210

# get a graph of all the mentioned versions:

1236

1211

graph = {}

1237

1212

pending = set(versions)

1238

while len(pending):

1213

cache = self._cache

1214

while pending:

1239

1215

version = pending.pop()

1240

parents = self._cache[version][4]

1241

# got the parents ok

1242

1216

# trim ghosts

1243

parents = [parent for parent in parents if parent in self._cache]

1244

for parent in parents:

1245

# if not completed and not a ghost

1246

if parent not in graph:

1247

pending.add(parent)

1217

try:

1218

parents = [p for p in cache[version][4] if p in cache]

1219

except KeyError:

1220

raise RevisionNotPresent(version, self._filename)

1221

# if not completed and not a ghost

1222

pending.update([p for p in parents if p not in graph])

1248

1223

graph[version] = parents

1249

1224

return topo_sort(graph.items())

1250

1225

1251

1226

def get_ancestry_with_ghosts(self, versions):

1252

1227

"""See VersionedFile.get_ancestry_with_ghosts."""

1253

1228

# get a graph of all the mentioned versions:

1229

self.check_versions_present(versions)

1230

cache = self._cache

1254

1231

graph = {}

1255

1232

pending = set(versions)

1256

while len(pending):

1233

while pending:

1257

1234

version = pending.pop()

1258

1235

try:

1259

parents = self._cache[version][4]

1236

parents = cache[version][4]

1260

1237

except KeyError:

1261

1238

# ghost, fake it

1262

1239

graph[version] = []

1263

pass

1264

1240

else:

1265

# got the parents ok

1266

for parent in parents:

1267

if parent not in graph:

1268

pending.add(parent)

1241

# if not completed

1242

pending.update([p for p in parents if p not in graph])

1269

1243

graph[version] = parents

1270

1244

return topo_sort(graph.items())

1271

1245

1287

1261

def _version_list_to_index(self, versions):

1288

1262

encode_utf8 = cache_utf8.encode

1289

1263

result_list = []

1264

cache = self._cache

1290

1265

for version in versions:

1291

if version in self._cache:

1266

if version in cache:

1292

1267

# -- inlined lookup() --

1293

result_list.append(str(self._cache[version][5]))

1268

result_list.append(str(cache[version][5]))

1294

1269

# -- end lookup () --

1295

1270

else:

1296

1271

result_list.append('.' + encode_utf8(version))

1342

1317

1343

1318

def has_version(self, version_id):

1344

1319

"""True if the version is in the index."""

1345

return (version_id in self._cache)

1320

return version_id in self._cache

1346

1321

1347

1322

def get_position(self, version_id):

1348

1323

"""Return data position and size of specified version."""

1349

return (self._cache[version_id][2], \

1350

self._cache[version_id][3])

1324

entry = self._cache[version_id]

1325

return entry[2], entry[3]

1351

1326

1352

1327

def get_method(self, version_id):

1353

1328

"""Return compression method of specified version."""

1355

1330

if 'fulltext' in options:

1356

1331

return 'fulltext'

1357

1332

else:

1358

assert 'line-delta' in options

1333

if 'line-delta' not in options:

1334

raise errors.KnitIndexUnknownMethod(self._full_path(), options)

1359

1335

return 'line-delta'

1360

1336

1361

1337

def get_options(self, version_id):

1372

1348

1373

1349

def check_versions_present(self, version_ids):

1374

1350

"""Check that all specified versions are present."""

1375

version_ids = set(version_ids)

1376

for version_id in list(version_ids):

1377

if version_id in self._cache:

1378

version_ids.remove(version_id)

1379

if version_ids:

1380

raise RevisionNotPresent(list(version_ids)[0], self.filename)

1351

cache = self._cache

1352

for version_id in version_ids:

1353

if version_id not in cache:

1354

raise RevisionNotPresent(version_id, self._filename)

1381

1355

1382

1356

1383

1357

class _KnitData(_KnitComponentFile):

1482

1456

as (stream, header_record)

1483

1457

"""

1484

1458

df = GzipFile(mode='rb', fileobj=StringIO(raw_data))

1485

rec = df.readline().split()

1459

rec = self._check_header(version_id, df.readline())

1460

return df, rec

1461

1462

def _check_header(self, version_id, line):

1463

rec = line.split()

1486

1464

if len(rec) != 4:

1487

raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')

1465

raise KnitCorrupt(self._filename,

1466

'unexpected number of elements in record header')

1488

1467

if cache_utf8.decode(rec[1]) != version_id:

1489

raise KnitCorrupt(self._filename,

1490

'unexpected version, wanted %r, got %r' % (

1491

version_id, rec[1]))

1492

return df, rec

1468

raise KnitCorrupt(self._filename,

1469

'unexpected version, wanted %r, got %r'

1470

% (version_id, rec[1]))

1471

return rec

1493

1472

1494

1473

def _parse_record(self, version_id, data):

1495

1474

# profiling notes:

1496

1475

# 4168 calls in 2880 217 internal

1497

1476

# 4168 calls to _parse_record_header in 2121

1498

1477

# 4168 calls to readlines in 330

1499

df, rec = self._parse_record_header(version_id, data)

1478

df = GzipFile(mode='rb', fileobj=StringIO(data))

1479

1500

1480

record_contents = df.readlines()

1501

l = record_contents.pop()

1481

header = record_contents.pop(0)

1482

rec = self._check_header(version_id, header)

1483

1484

last_line = record_contents.pop()

1502

1485

assert len(record_contents) == int(rec[2])

1503

if l != 'end %s\n' % cache_utf8.encode(version_id):

1504

raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'

1505

% (l, version_id))

1486

if last_line != 'end %s\n' % rec[1]:

1487

raise KnitCorrupt(self._filename,

1488

'unexpected version end line %r, wanted %r'

1489

% (last_line, version_id))

1506

1490

df.close()

1507

1491

return record_contents, rec[3]

1508

1492

1526

1510

in records]

1527

1511

1528

1512

raw_records = self._transport.readv(self._filename, needed_offsets)

1529

1530

1513

1531

1514

for version_id, pos, size in records:

1532

1515

if version_id in self._cache:

1622

1605

if not version_ids:

1623

1606

return 0

1624

1607

1625

pb = bzrlib.ui.ui_factory.nested_progress_bar()

1608

pb = ui.ui_factory.nested_progress_bar()

1626

1609

try:

1627

1610

version_ids = list(version_ids)

1628

1611

if None in version_ids:

1739

1722

if not version_ids:

1740

1723

return 0

1741

1724

1742

pb = bzrlib.ui.ui_factory.nested_progress_bar()

1725

pb = ui.ui_factory.nested_progress_bar()

1743

1726

try:

1744

1727

version_ids = list(version_ids)

1745

1728

1939

1922

bestsize = bestsize + 1

1940

1923

1941

1924

return besti, bestj, bestsize

1942

Older »