~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

Committer: Robert Collins
Date: 2006-03-08 00:20:04 UTC
mfrom: (1563.2.42 versioned-file-knit-stores)
mto: (1596.2.3 integration) (1594.3.1 versioned-file-performance)
mto: This revision was merged to the branch mainline in revision 1596.
Revision ID: robertc@robertcollins.net-20060308002004-a9950b2fb75d14de

Merge in knit repository use of knits - still not a stable format, but can be experimented with.

files added:
bzrlib/inter.py

bzrlib/knit.py

bzrlib/store/revision

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/store/versioned

bzrlib/tests/interversionedfile_implementations

bzrlib/tests/interversionedfile_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/test_join.py

bzrlib/tests/revisionstore_implementations

bzrlib/tests/revisionstore_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_reweave.py.moved

bzrlib/tests/test_versionedfile.py

bzrlib/versionedfile.py

files removed:
bzrlib/tests/test_reweave.py

files renamed:
bzrlib/store/weave.py => bzrlib/store/versioned/__init__.py

files modified:
BRANCH.TODO

bzrlib/annotate.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bzrdir.py

bzrlib/check.py

bzrlib/commit.py

bzrlib/errors.py

bzrlib/fetch.py

bzrlib/identitymap.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/lockable_files.py

bzrlib/merge.py

bzrlib/reconcile.py

bzrlib/repository.py

bzrlib/sign_my_commits.py

bzrlib/store/__init__.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/repository_implementations/test_reconcile.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_store.py

bzrlib/tests/test_transactions.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_weave.py

bzrlib/transactions.py

bzrlib/transport/__init__.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/sftp.py

bzrlib/tree.py

bzrlib/weave.py

bzrlib/weavefile.py

Show diffs side-by-side

added added

removed removed

bzrlib/knit.py

# Written by Martin Pool.

# Modified by Johan Rydberg <jrydberg@gnu.org>

# Modified by Robert Collins <robert.collins@canonical.com>

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only

updates.

Knit file layout:

lifeless: the data file is made up of "delta records". each delta record has a delta header

that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of

the -expanded data- (ie, the delta applied to the parent). the delta also ends with a

end-marker; simply "end VERSION"

A delta can be a line delta or the full contents.

... the 8's there are the index number of the annotation.

version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e

59,59,3

8 if ie.executable:

8 e.set('executable', 'yes')

130,130,2

8 if elt.get('executable') == 'yes':

8 ie.executable = True

end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

What's in an index:

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents

09:33 < jrydberg> lifeless: the parents are currently dictionary compressed

09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)

09:33 < lifeless> right

09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used

in the deltas to provide line annotation

"""

# TODOS:

# 10:16 < lifeless> make partial index writes safe

# 10:16 < lifeless> implement 'knit.check()' like weave.check()

# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave

# always' approach.

# move sha1 out of the content so that join is faster at verifying parents

# record content length ?

from cStringIO import StringIO

import difflib

from difflib import SequenceMatcher

from gzip import GzipFile

import os

import bzrlib.errors as errors

from bzrlib.errors import FileExists, NoSuchFile, KnitError, \

InvalidRevisionId, KnitCorrupt, KnitHeaderError, \

RevisionNotPresent, RevisionAlreadyPresent

from bzrlib.trace import mutter

from bzrlib.osutils import contains_whitespace, contains_linebreaks, \

sha_strings

from bzrlib.versionedfile import VersionedFile, InterVersionedFile

from bzrlib.tsort import topo_sort

# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data

# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor

# position after writing to work out where it was located. we may need to

# bypass python file buffering.

DATA_SUFFIX = '.knit'

INDEX_SUFFIX = '.kndx'

100

class KnitContent(object):
    """Annotated lines of one knit version, updatable through line deltas.

    The content is held in ``_lines`` as a list of (origin, text) pairs.
    """

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Iterate over the (origin, text) pair of every content line."""
        for pair in self._lines:
            origin, text = pair
            yield origin, text

    def annotate(self):
        """Return the (origin, text) pairs as a list."""
        return [entry for entry in self.annotate_iter()]

    def apply_delta(self, delta):
        """Apply delta to this content, in place.

        delta is a sequence of (start, end, count, lines) hunks: the slice
        [start:end] of the original content is replaced by lines, which
        holds count entries.
        """
        # shift tracks how far earlier hunks have moved later positions.
        shift = 0
        for start, end, count, lines in delta:
            self._lines[shift + start:shift + end] = lines
            shift += count - (end - start)

    def line_delta_iter(self, new_lines):
        """Yield the hunks that turn this content into new_lines.

        new_lines is another content object; each hunk is a
        (start, end, count, lines) tuple as accepted by apply_delta.
        """
        target = new_lines._lines
        new_texts = [text for _origin, text in target]
        old_texts = [text for _origin, text in self._lines]
        matcher = difflib.SequenceMatcher(None, old_texts, new_texts)
        for tag, old_lo, old_hi, new_lo, new_hi in matcher.get_opcodes():
            if tag != 'equal':
                yield (old_lo, old_hi, new_hi - new_lo, target[new_lo:new_hi])

    def line_delta(self, new_lines):
        """Return the hunks of line_delta_iter as a list."""
        return [hunk for hunk in self.line_delta_iter(new_lines)]

    def text(self):
        """Return the text of every line, without the origins."""
        return [text for _origin, text in self._lines]

137

138

139

class _KnitFactory(object):
    """Common base of the factories that build content objects."""

    def make(self, lines, version):
        """Return a KnitContent in which every line has origin version."""
        return KnitContent([(version, line) for line in lines])

145

146

147

class KnitAnnotateFactory(_KnitFactory):
    """Factory for content whose stored lines carry their own origin.

    Every stored line has the form '<origin index> <text>'.
    """

    annotated = True

    def parse_fulltext(self, content, version):
        """Turn stored fulltext lines into a KnitContent.

        The origin is read from each line; version is not used.
        """
        parsed = []
        for stored in content:
            origin, text = stored.split(' ', 1)
            parsed.append((int(origin), text))
        return KnitContent(parsed)

    def parse_line_delta_iter(self, lines):
        """Yield (start, end, count, contents) hunks from stored delta lines.

        lines is consumed: entries are popped off its front as they are
        parsed.
        """
        while lines:
            fields = lines.pop(0).split(',')
            start, end, count = [int(field) for field in fields]
            hunk = []
            for _ in range(count):
                origin, text = lines.pop(0).split(' ', 1)
                hunk.append((int(origin), text))
            yield start, end, count, hunk

    def parse_line_delta(self, lines, version):
        """Return the hunks of parse_line_delta_iter as a list.

        version is not used.
        """
        return [hunk for hunk in self.parse_line_delta_iter(lines)]

    def lower_fulltext(self, content):
        """Serialise content into '<origin> <text>' lines."""
        return ['%d %s' % (origin, text) for origin, text in content._lines]

    def lower_line_delta(self, delta):
        """Serialise hunks into header lines followed by annotated lines."""
        stored = []
        for start, end, count, lines in delta:
            stored.append('%d,%d,%d\n' % (start, end, count))
            stored.extend(['%d %s' % (origin, text) for origin, text in lines])
        return stored

182

183

184

class KnitPlainFactory(_KnitFactory):
    """Factory for content stored without per-line origin information."""

    annotated = False

    def parse_fulltext(self, content, version):
        """Build content in which every line is attributed to version."""
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        """Yield (start, end, count, contents) hunks from stored delta lines.

        Every line of a hunk is attributed to version.  lines is consumed
        as the hunks are produced.
        """
        while lines:
            fields = lines.pop(0).split(',')
            start, end, count = [int(field) for field in fields]
            hunk = zip([version] * count, lines[:count])
            yield start, end, count, hunk
            del lines[:count]

    def parse_line_delta(self, lines, version):
        """Return the hunks of parse_line_delta_iter as a list."""
        return [hunk for hunk in self.parse_line_delta_iter(lines, version)]

    def lower_fulltext(self, content):
        """Serialise content as its plain text lines."""
        return content.text()

    def lower_line_delta(self, delta):
        """Serialise hunks into header lines followed by plain text lines."""
        stored = []
        for start, end, count, lines in delta:
            stored.append('%d,%d,%d\n' % (start, end, count))
            stored.extend([text for _origin, text in lines])
        return stored

211

212

213

def make_empty_knit(transport, relpath):
    """Construct a empty knit at the specified location.

    :param transport: transport the knit files are written through.
    :param relpath: base name of the knit on that transport.
    """
    # KnitVersionedFile takes (relpath, transport, ...) and expects a factory
    # instance; create=True lets the index be written when it does not exist.
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory(), create=True)
    k._data._open_file()

217

218

219

class KnitVersionedFile(VersionedFile):
    """Weave-like structure with faster random access.

    A knit stores a number of texts and a summary of the relationships
    between them.  Texts are identified by a string version-id.  Texts
    are normally stored and retrieved as a series of lines, but can
    also be passed as single strings.

    Lines are stored with the trailing newline (if any) included, to
    avoid special cases for files with no final newline.  Lines are
    composed of 8-bit characters, not unicode.  The combination of
    these approaches should mean any 'binary' file can be safely
    stored and retrieved.
    """

    def __init__(self, relpath, transport, file_mode=None, access_mode=None, factory=None,
                 basis_knit=None, delta=True, create=False):
        """Construct a knit at location specified by relpath.

        :param relpath: base name; INDEX_SUFFIX and DATA_SUFFIX are appended
            to it to name the two component files.
        :param transport: transport the component files are accessed through.
        :param file_mode: accepted but not used by this constructor.
        :param access_mode: 'r' or 'w'; None means 'w'.
        :param factory: content factory, defaults to KnitAnnotateFactory().
        :param basis_knit: optional KnitVersionedFile that is consulted
            first when extracting texts.
        :param delta: whether new texts may be stored as line-deltas.
        :param create: If not True, only open an existing knit.
        """
        if access_mode is None:
            access_mode = 'w'
        assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
        assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
            type(basis_knit)

        self.transport = transport
        self.filename = relpath
        self.basis_knit = basis_knit
        self.factory = factory or KnitAnnotateFactory()
        self.writable = (access_mode == 'w')
        self.delta = delta

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
                                 access_mode, create=create)
        # The data file is (re)created whenever the index holds no versions.
        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
                               access_mode, create=not len(self.versions()))

    def copy_to(self, name, transport):
        """See VersionedFile.copy_to()."""
        # copy the current index to a temp index to avoid racing with local
        # writes
        transport.put(name + INDEX_SUFFIX + '.tmp',
                      self.transport.get(self._index._filename))
        # copy the data file
        transport.put(name + DATA_SUFFIX, self._data._open_file())
        # rename the copied index into place
        transport.rename(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)

    def create_empty(self, name, transport, mode=None):
        """Return a new knit at name with this knit's factory and delta flag.

        mode is accepted but not used.
        """
        return KnitVersionedFile(name, transport, factory=self.factory,
                                 delta=self.delta, create=True)

    @staticmethod
    def get_suffixes():
        """See VersionedFile.get_suffixes()."""
        return [DATA_SUFFIX, INDEX_SUFFIX]

    def versions(self):
        """See VersionedFile.versions."""
        return self._index.get_versions()

    def has_version(self, version_id):
        """See VersionedFile.has_version."""
        return self._index.has_version(version_id)

    __contains__ = has_version

    def _merge_annotations(self, content, parents):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text.

        Lines of content that match a parent's text take over that
        parent's (origin, text) entries; content is modified in place.
        """
        for parent_id in parents:
            merge_content = self._get_content(parent_id)
            seq = SequenceMatcher(None, merge_content.text(), content.text())
            for i, j, n in seq.get_matching_blocks():
                if n == 0:
                    continue
                content._lines[j:j+n] = merge_content._lines[i:i+n]

    def _get_components(self, version_id):
        """Return a list of (version_id, method, data) tuples that
        makes up version specified by version_id of the knit.

        The components should be applied in the order of the returned
        list.

        The basis knit will be used to the largest extent possible
        since it is assumed that accesses to it is faster.
        """
        # needed_versions holds a list of (method, version_id) of
        # versions that is needed to be fetched to construct the final
        # version of the file.

        # basis_versions is a list of versions that needs to be
        # fetched but exists in the basis knit.

        basis = self.basis_knit
        needed_versions = []
        basis_versions = []
        cursor = version_id

        # Follow first parents until a fulltext is reached.
        while True:
            picked_knit = self
            if basis and basis._index.has_version(cursor):
                picked_knit = basis
                basis_versions.append(cursor)
            method = picked_knit._index.get_method(cursor)
            needed_versions.append((method, cursor))
            if method == 'fulltext':
                break
            cursor = picked_knit.get_parents(cursor)[0]

        components = {}
        if basis_versions:
            records = []
            for comp_id in basis_versions:
                data_pos, data_size = basis._index.get_position(comp_id)
                records.append((comp_id, data_pos, data_size))
            components.update(basis._data.read_records(records))

        records = []
        for comp_id in [vid for method, vid in needed_versions
                        if vid not in basis_versions]:
            data_pos, data_size = self._index.get_position(comp_id)
            records.append((comp_id, data_pos, data_size))
        components.update(self._data.read_records(records))

        # read_records returns a mapping with the version id as
        # index and the value as data.  The order the components need
        # to be applied is held by needed_versions (reversed).
        out = []
        for method, comp_id in reversed(needed_versions):
            out.append((comp_id, method, components[comp_id]))

        return out

    def _get_content(self, version_id):
        """Returns a content object that makes up the specified
        version.

        Raises RevisionNotPresent when the version is not in this knit's
        index, and KnitCorrupt when the sha-1 of the rebuilt text differs
        from the digest stored with the last applied record.
        """
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        if self.basis_knit and version_id in self.basis_knit:
            return self.basis_knit._get_content(version_id)

        content = None
        components = self._get_components(version_id)
        for component_id, method, (data, digest) in components:
            version_idx = self._index.lookup(component_id)
            if method == 'fulltext':
                assert content is None
                content = self.factory.parse_fulltext(data, version_idx)
            elif method == 'line-delta':
                delta = self.factory.parse_line_delta(data, version_idx)
                content.apply_delta(delta)

        if 'no-eol' in self._index.get_options(version_id):
            # The stored text had a newline appended; take it off again.
            line = content._lines[-1][1].rstrip('\n')
            content._lines[-1] = (content._lines[-1][0], line)

        if sha_strings(content.text()) != digest:
            raise KnitCorrupt(self.filename, 'sha-1 does not match')

        return content

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present.

        Raises RevisionNotPresent naming one of the missing versions.
        """
        missing = [version_id for version_id in set(version_ids)
                   if not self._index.has_version(version_id)]
        if missing:
            raise RevisionNotPresent(missing[0], self.filename)

    def add_lines(self, version_id, parents, lines):
        """See VersionedFile.add_lines."""
        assert self.writable, "knit is not opened for write"
        ### FIXME escape. RBC 20060228
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id)
        if self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)

        # A newline may only appear as the last character of a line.
        for l in lines:
            assert '\n' not in l[:-1]

        self._check_versions_present(parents)
        # _add modifies its list, so hand it a copy.
        return self._add(version_id, lines[:], parents, self.delta)

    def _add(self, version_id, lines, parents, delta):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first parent.
        """
        if delta and not parents:
            delta = False

        # The digest covers the text as given, before any newline is added.
        digest = sha_strings(lines)
        options = []
        if lines:
            # endswith also copes with an empty final line, which indexing
            # the last character would not.
            if not lines[-1].endswith('\n'):
                options.append('no-eol')
                lines[-1] = lines[-1] + '\n'

        lines = self.factory.make(lines, len(self._index))
        if self.factory.annotated and len(parents) > 0:
            # Merge annotations from parent texts if so is needed.
            self._merge_annotations(lines, parents)

        if parents and delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spend applying deltas.
            count = 0
            delta_parents = parents
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # No fulltext within the limit: store this one as fulltext.
                delta = False

        if delta:
            options.append('line-delta')
            content = self._get_content(parents[0])
            delta_hunks = content.line_delta(lines)
            store_lines = self.factory.lower_line_delta(delta_hunks)
        else:
            options.append('fulltext')
            store_lines = self.factory.lower_fulltext(lines)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)

    def check(self, progress_bar=None):
        """See VersionedFile.check()."""

    def clone_text(self, new_version_id, old_version_id, parents):
        """See VersionedFile.clone_text()."""
        # FIXME RBC 20060228 make fast by only inserting an index with null delta.
        self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

    def get_lines(self, version_id):
        """See VersionedFile.get_lines()."""
        return self._get_content(version_id).text()

    def num_versions(self):
        """See VersionedFile.num_versions()."""
        return self._index.num_versions()

    __len__ = num_versions

    def annotate_iter(self, version_id):
        """See VersionedFile.annotate_iter."""
        content = self._get_content(version_id)
        for origin, text in content.annotate_iter():
            yield self._index.idx_to_name(origin), text

    def get_parents(self, version_id):
        """See VersionedFile.get_parents."""
        self._check_versions_present([version_id])
        return list(self._index.get_parents(version_id))

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry(versions)

    def _remap_origins(self, other, annotated, new_version_id,
                       new_version_idx):
        """Translate (origin, line) pairs from other's index numbers to ours.

        Lines that other attributes to new_version_id get new_version_idx;
        all other origins are looked up by name in this knit's index.
        """
        remapped = []
        for origin, line in annotated:
            old_version_id = other._index.idx_to_name(origin)
            if old_version_id == new_version_id:
                idx = new_version_idx
            else:
                idx = self._index.lookup(old_version_id)
            remapped.append((idx, line))
        return remapped

    def _reannotate_line_delta(self, other, lines, new_version_id,
                               new_version_idx):
        """Re-annotate line-delta and return new delta."""
        new_delta = []
        for start, end, count, contents \
                in self.factory.parse_line_delta_iter(lines):
            new_lines = self._remap_origins(other, contents, new_version_id,
                                            new_version_idx)
            new_delta.append((start, end, count, new_lines))

        return self.factory.lower_line_delta(new_delta)

    def _reannotate_fulltext(self, other, lines, new_version_id,
                             new_version_idx):
        """Re-annotate fulltext and return new version."""
        content = self.factory.parse_fulltext(lines, new_version_idx)
        new_lines = self._remap_origins(other, content.annotate_iter(),
                                        new_version_id, new_version_idx)

        return self.factory.lower_fulltext(KnitContent(new_lines))

    def walk(self, version_ids):
        """See VersionedFile.walk."""
        # We take the short path here, and extract all relevant texts
        # and put them in a weave and let that do all the work.  Far
        # from optimal, but is much simpler.
        # FIXME RB 20060228 this really is inefficient!
        from bzrlib.weave import Weave

        w = Weave(self.filename)
        ancestry = self.get_ancestry(version_ids)
        sorted_graph = topo_sort(self._index.get_graph())
        version_list = [vid for vid in sorted_graph if vid in ancestry]

        for version_id in version_list:
            lines = self.get_lines(version_id)
            w.add_lines(version_id, self.get_parents(version_id), lines)

        for lineno, insert_id, dset, line in w.walk(version_ids):
            yield lineno, insert_id, dset, line

547

548

549

class _KnitComponentFile(object):

550

"""One of the files used to implement a knit database"""

551

552

def __init__(self, transport, filename, mode):

553

self._transport = transport

554

self._filename = filename

555

self._mode = mode

556

557

def write_header(self):

558

old_len = self._transport.append(self._filename, StringIO(self.HEADER))

559

if old_len != 0:

560

raise KnitCorrupt(self._filename, 'misaligned after writing header')

561

562

def check_header(self, fp):

563

line = fp.read(len(self.HEADER))

564

if line != self.HEADER:

565

raise KnitHeaderError(badline=line)

566

567

def commit(self):

568

"""Commit is a nop."""

569

570

def __repr__(self):

571

return '%s(%s)' % (self.__class__.__name__, self._filename)

572

573

574

class _KnitIndex(_KnitComponentFile):
    """Manages knit index file.

    The index is already kept in memory and read on startup, to enable
    fast lookups of revision information.  The cursor of the index
    file is always pointing to the end, making it easy to append
    entries.

    _cache is a cache for fast mapping from version id to a Index
    object.

    _history is a cache for fast mapping from indexes to version ids.

    The index data format is dictionary compressed when it comes to
    parent references; a index entry may only have parents that with a
    lower index number.  As a result, the index is topological sorted.

    Duplicate entries may be written to the index for a single version id
    if this is done then the latter one completely replaces the former:
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.
    """

    HEADER = "# bzr knit index 7\n"

    def _cache_version(self, version_id, options, pos, size, parents):
        """Record an entry in memory, replacing any earlier one for the id."""
        # _history holds exactly the keys of _cache, so the dictionary can
        # answer "seen before?" without scanning the list.
        if version_id not in self._cache:
            self._history.append(version_id)
        self._cache[version_id] = (version_id, options, pos, size, parents)

    def _iter_index(self, fp):
        """Yield the whitespace separated fields of each remaining line of fp."""
        lines = fp.read()
        for l in lines.splitlines(False):
            yield l.split()

    def __init__(self, transport, filename, mode, create=False):
        """Load the index, or write a fresh header if it may be created.

        A missing file is only tolerated when mode is 'w' and create is
        true; otherwise NoSuchFile propagates.
        """
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._cache = {}
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
        # so - wc -l of a knit index is != the number of unique names
        # in the weave.
        self._history = []
        try:
            fp = self._transport.get(self._filename)
            self.check_header(fp)
            for rec in self._iter_index(fp):
                # Parents are stored as positions of earlier entries.
                parents = [self._history[int(i)] for i in rec[4:]]
                self._cache_version(rec[0], rec[1].split(','), int(rec[2]),
                                    int(rec[3]), parents)
        except NoSuchFile:
            if mode != 'w' or not create:
                raise
            self.write_header()

    def get_graph(self):
        """Return a list of (version_id, parents) for every cached version."""
        return [(version_id, entry[4])
                for version_id, entry in self._cache.items()]

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            parents = self._cache[version][4]
            for parent in parents:
                if parent not in graph:
                    pending.add(parent)
            graph[version] = parents
        return topo_sort(graph.items())

    def num_versions(self):
        """Return the number of distinct version ids in the index."""
        return len(self._history)

    __len__ = num_versions

    def get_versions(self):
        """Return the version ids in index order (the internal list itself)."""
        return self._history

    def idx_to_name(self, idx):
        """Return the version id stored at index position idx."""
        return self._history[idx]

    def lookup(self, version_id):
        """Return the index position of version_id."""
        assert version_id in self._cache
        return self._history.index(version_id)

    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        self._cache_version(version_id, options, pos, size, parents)

        parent_refs = ' '.join([str(self.lookup(vid)) for vid in parents])
        content = "%s %s %s %s %s\n" % (version_id,
                                        ','.join(options),
                                        pos,
                                        size,
                                        parent_refs)
        self._transport.append(self._filename, StringIO(content))

    def has_version(self, version_id):
        """True if the version is in the index."""
        return version_id in self._cache

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        entry = self._cache[version_id]
        return (entry[2], entry[3])

    def get_method(self, version_id):
        """Return compression method of specified version."""
        options = self._cache[version_id][1]
        if 'fulltext' in options:
            return 'fulltext'
        else:
            assert 'line-delta' in options
            return 'line-delta'

    def get_options(self, version_id):
        """Return the option list stored for the specified version."""
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version."""
        return self._cache[version_id][4]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present.

        Raises RevisionNotPresent naming one of the missing versions.
        """
        version_ids = set(version_ids)
        for version_id in list(version_ids):
            if version_id in self._cache:
                version_ids.remove(version_id)
        if version_ids:
            # The file name lives in _filename on component files.
            raise RevisionNotPresent(list(version_ids)[0], self._filename)

710

711

712

class _KnitData(_KnitComponentFile):
    """Contents of the knit data file.

    Each record is written as its own gzip stream holding a
    'version <id> <line count> <digest>' line, the record's lines and an
    'end <id>' line.
    """

    HEADER = "# bzr knit data 7\n"

    def __init__(self, transport, filename, mode, create=False):
        """Remember the location; with create, put an empty file there."""
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._file = None
        self._checked = False
        if create:
            self._transport.put(self._filename, StringIO(''))

    def _open_file(self):
        """Return the open data file, or None if it does not exist.

        The file object is kept in _file until read_records_iter has run
        to completion.
        """
        if self._file is None:
            try:
                self._file = self._transport.get(self._filename)
            except NoSuchFile:
                pass
        return self._file

    def add_record(self, version_id, digest, lines):
        """Write new text record to disk.  Returns the position in the
        file where it was written, and the compressed size of the record.
        """
        sio = StringIO()
        data_file = GzipFile(None, mode='wb', fileobj=sio)
        data_file.write("version %s %d %s\n" % (version_id, len(lines), digest))
        data_file.writelines(lines)
        # Two newlines: the end marker's own, plus the one the original
        # print statement appended after it.
        data_file.write("end %s\n\n" % version_id)
        data_file.close()

        content = sio.getvalue()
        start_pos = self._transport.append(self._filename, StringIO(content))
        return start_pos, len(content)

    def _parse_record(self, version_id, data):
        """Decompress one record and return (lines, digest).

        Raises KnitCorrupt when the header does not have four fields, names
        a different version, or the end marker is not the expected line.
        """
        df = GzipFile(mode='rb', fileobj=StringIO(data))
        rec = df.readline().split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename, 'unexpected number of records')
        if rec[1] != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r' % version_id)
        lines = int(rec[2])
        record_contents = self._read_record_contents(df, lines)
        l = df.readline()
        if l != 'end %s\n' % version_id:
            raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
                              % (l, version_id))
        return record_contents, rec[3]

    def _read_record_contents(self, df, record_lines):
        """Read and return n lines from datafile."""
        return [df.readline() for _ in range(record_lines)]

    def read_records_iter(self, records):
        """Read text records from data file and yield result.

        Each passed record is a tuple of (version_id, pos, len) and
        will be read in the given order.  Yields (version_id,
        contents, digest).

        The records list is consumed while iterating.
        """
        fp = self._open_file()

        # Loop through all records and try to collect as large
        # continuous region as possible to read.
        while records:
            record_id, pos, size = records.pop(0)
            start_pos = pos
            end_pos = pos + size
            run = [(record_id, size)]
            # Extend the run while the next record starts where it ends.
            while records:
                record_id, pos, size = records[0]
                if pos != end_pos:
                    break
                del records[0]
                end_pos = pos + size
                run.append((record_id, size))
            # One seek per run; the records are then read back to back.
            fp.seek(start_pos, 0)
            for record_id, size in run:
                content, digest = self._parse_record(record_id, fp.read(size))
                yield record_id, content, digest

        self._file = None

    def read_records(self, records):
        """Read records into a dictionary."""
        components = {}
        for record_id, content, digest in self.read_records_iter(records):
            components[record_id] = (content, digest)
        return components

820

821

822

class InterKnit(InterVersionedFile):

823

"""Optimised code paths for knit to knit operations."""

824

825

_matching_file_factory = KnitVersionedFile

826

827

@staticmethod
def is_compatible(source, target):
    """Return True only when source and target are both knits."""
    try:
        if not isinstance(source, KnitVersionedFile):
            return False
        return isinstance(target, KnitVersionedFile)
    except AttributeError:
        return False

835

836

def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):

837

"""See InterVersionedFile.join."""

838

assert isinstance(self.source, KnitVersionedFile)

839

assert isinstance(self.target, KnitVersionedFile)

840

841

if version_ids is None:

842

version_ids = self.source.versions()

843

else:

844

if not ignore_missing:

845

self.source._check_versions_present(version_ids)

846

else:

847

version_ids = set(self.source.versions()).intersection(

848

set(version_ids))

849

850

if not version_ids:

851

return 0

852

853

if pb is None:

854

from bzrlib.progress import DummyProgress

855

pb = DummyProgress()

856

857

version_ids = list(version_ids)

858

if None in version_ids:

859

version_ids.remove(None)

860

861

self.source_ancestry = set(self.source.get_ancestry(version_ids))

862

this_versions = set(self.target._index.get_versions())

863

needed_versions = self.source_ancestry - this_versions

864

cross_check_versions = self.source_ancestry.intersection(this_versions)

865

mismatched_versions = set()

866

for version in cross_check_versions:

867

# scan to include needed parents.

868

n1 = set(self.target.get_parents(version))

869

n2 = set(self.source.get_parents(version))

870

if n1 != n2:

871

# FIXME TEST this check for cycles being introduced works

872

# the logic is we have a cycle if in our graph we are an

873

# ancestor of any of the n2 revisions.

874

for parent in n2:

875

if parent in n1:

876

# safe

877

continue

878

else:

879

parent_ancestors = self.source.get_ancestry(parent)

880

if version in parent_ancestors:

881

raise errors.GraphCycleError([parent, version])

882

# ensure this parent will be available later.

883

new_parents = n2.difference(n1)

884

needed_versions.update(new_parents.difference(this_versions))

885

mismatched_versions.add(version)

886

887

if not needed_versions and not cross_check_versions:

888

return 0

889

full_list = topo_sort(self.source._index.get_graph())

890

891

version_list = [i for i in full_list if (not self.target.has_version(i)

892

and i in needed_versions)]

893

894

records = []

895

for version_id in version_list:

896

data_pos, data_size = self.source._index.get_position(version_id)

897

records.append((version_id, data_pos, data_size))

898

899

count = 0

900

for version_id, lines, digest \

901

in self.source._data.read_records_iter(records):

902

options = self.source._index.get_options(version_id)

903

parents = self.source._index.get_parents(version_id)

904

905

for parent in parents:

906

assert self.target.has_version(parent)

907

908

if self.target.factory.annotated:

909

# FIXME jrydberg: it should be possible to skip

910

# re-annotating components if we know that we are

911

# going to pull all revisions in the same order.

912

new_version_id = version_id

913

new_version_idx = self.target._index.num_versions()

914

if 'fulltext' in options:

915

lines = self.target._reannotate_fulltext(self.source, lines,

916

new_version_id, new_version_idx)

917

elif 'line-delta' in options:

918

lines = self.target._reannotate_line_delta(self.source, lines,

919

new_version_id, new_version_idx)

920

921

count = count + 1

922

pb.update("Joining knit", count, len(version_list))

923

924

pos, size = self.target._data.add_record(version_id, digest, lines)

925

self.target._index.add_version(version_id, options, pos, size, parents)

926

927

for version in mismatched_versions:

928

n1 = set(self.target.get_parents(version))

929

n2 = set(self.source.get_parents(version))

930

# write a combined record to our history.

931

new_parents = self.target.get_parents(version) + list(n2.difference(n1))

932

current_values = self.target._index._cache[version]

933

self.target._index.add_version(version,

934

current_values[1],

935

current_values[2],

936

current_values[3],

937

new_parents)

938

pb.clear()

939

return count

940

941

942

# Register InterKnit as an optimiser on InterVersionedFile at import time.
# NOTE(review): presumably this lets InterVersionedFile pick InterKnit when
# InterKnit.is_compatible accepts the source/target pair - confirm against
# InterVersionedFile.register_optimiser.
InterVersionedFile.register_optimiser(InterKnit)

Older »