        yield s_pos + (target_len - t_pos), target_len, 0


class _KnitFactory(object):
    """Base factory for creating content objects."""

    def make(self, lines, version_id):
        num_lines = len(lines)
        return KnitContent(zip([version_id] * num_lines, lines))


class AnnotatedKnitContent(KnitContent):
    """Annotated content."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        return iter(self._lines)

    def strip_last_line_newline(self):
        line = self._lines[-1][1].rstrip('\n')
        self._lines[-1] = (self._lines[-1][0], line)

    def text(self):
        return [text for origin, text in self._lines]

    def copy(self):
        return AnnotatedKnitContent(self._lines[:])


class PlainKnitContent(KnitContent):
    """Unannotated content.

    When annotate[_iter] is called on this content, the same version is reported
    for all lines. Generally, annotate[_iter] is not useful on PlainKnitContent
    objects.
    """

    def __init__(self, lines, version_id):
        self._lines = lines
        self._version_id = version_id

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        for line in self._lines:
            yield self._version_id, line

    def copy(self):
        return PlainKnitContent(self._lines[:], self._version_id)

    def strip_last_line_newline(self):
        self._lines[-1] = self._lines[-1].rstrip('\n')
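

# Editorial sketch, not part of the original module: a minimal illustration of
# the data shapes the two content classes above carry, assuming the classes
# exactly as defined.
def _content_classes_example():
    annotated = AnnotatedKnitContent([('rev-1', 'a\n'), ('rev-2', 'b\n')])
    plain = PlainKnitContent(['a\n', 'b\n'], 'rev-2')
    # Annotated content keeps (origin, text) tuples per line; plain content
    # keeps bare lines and reports its own version id for every line.
    return list(annotated.annotate_iter()), list(plain.annotate_iter())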


class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def make(self, lines, version_id):
        num_lines = len(lines)
        return AnnotatedKnitContent(zip([version_id] * num_lines, lines))

    def parse_fulltext(self, content, version_id):
        """Convert fulltext to internal representation."""

    def parse_line_delta(self, lines, version_id, plain=False):
        """Convert a line based delta into internal representation."""
        result = []
        lines = iter(lines)
        next = lines.next

        cache = {}
        def cache_and_return(line):
            origin, text = line.split(' ', 1)
            return cache.setdefault(origin, origin), text

        # walk through the lines parsing.
        # Note that the plain test is explicitly pulled out of the
        # loop to minimise any performance impact
        if plain:
            for header in lines:
                start, end, count = [int(n) for n in header.split(',')]
                contents = [next().split(' ', 1)[1] for i in xrange(count)]
                result.append((start, end, count, contents))
        else:
            for header in lines:
                start, end, count = [int(n) for n in header.split(',')]
                contents = [tuple(next().split(' ', 1)) for i in xrange(count)]
                result.append((start, end, count, contents))
        return result
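
    # Editorial note, not part of the original module: the serialised
    # annotated line-delta parsed above is a sequence of hunks, each a
    # "start,end,count" header line followed by `count` content lines of the
    # form "origin text".  For example the hunk
    #
    #     1,2,2
    #     rev-2 replacement line one
    #     rev-2 replacement line two
    #
    # parses to (1, 2, 2, [('rev-2', 'replacement line one\n'),
    # ('rev-2', 'replacement line two\n')]), i.e. "replace line 1 of the
    # basis with these two lines".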

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return (line.split(' ', 1)[1] for line in lines)

        return fulltext_size > delta_size
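
    # Editorial note, not part of the original module: the bare
    # `fulltext_size > delta_size` comparison above is the tail of the knit's
    # should-we-store-a-delta heuristic: one more delta is only worth storing
    # while the accumulated compressed size of the existing delta chain stays
    # below the compressed size of a fresh fulltext.  With hypothetical
    # figures:
    #
    #     fulltext_size = 4000
    #     delta_size = 900        # sum of the sizes already in the chain
    #     fulltext_size > delta_size  =>  True, so store another delta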

    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, [])  # should we check the lines ?
        self._check_versions_present(parents)
        present_parents = []
        ghosts = []
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta_parent is None:
            # reconstitute as full text.
            assert len(delta) == 1 or len(delta) == 0
            if len(delta):
                assert delta[0][0] == 0
                assert delta[0][1] == 0, delta[0][1]
            return super(KnitVersionedFile, self)._add_delta(version_id,
                parents, delta_parent, sha1, noeol, delta)

        digest = sha1

        options = []
        if noeol:
            options.append('no-eol')

        if delta_parent is not None:
            # determine the current delta chain length.
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas. This should minimize both
            # I/O and the time spent applying deltas.
            # The window was changed to a maximum of 200 deltas, and a check
            # was added that the total compressed size of the deltas is
            # smaller than the compressed size of the fulltext.
            if not self._check_should_delta([delta_parent]):
                # We don't want a delta here, just do a normal insertion.
                return super(KnitVersionedFile, self)._add_delta(version_id,
                    parents, delta_parent, sha1, noeol, delta)

        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta)

        access_memo = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, access_memo, parents)

    def _add_raw_records(self, records, data):
        """Add all the records 'records' with data pre-joined in 'data'."""

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        self._index.check_versions_present(version_ids)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts,
        nostore_sha, random_id, check_content):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines, random_id, check_content)
        return self._add(version_id, lines, parents, self.delta,
            parent_texts, None, nostore_sha, random_id)

    def _add_lines(self, version_id, parents, lines, parent_texts,
        left_matching_blocks, nostore_sha, random_id, check_content):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines, random_id, check_content)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta,
            parent_texts, left_matching_blocks, nostore_sha, random_id)

    def _check_add(self, version_id, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        assert self.writable, "knit is not opened for write"
        ### FIXME escape. RBC 20060228
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id, self.filename)
        self.check_not_reserved_id(version_id)
        # Technically this could be avoided if we are happy to allow duplicate
        # id insertion when other things than bzr core insert texts, but it
        # seems useful for folk using the knit api directly to have some safety
        # blanket that we can disable.
        if not random_id and self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)
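
    # Editorial note, not part of the original module: _check_add above
    # rejects version ids containing whitespace (InvalidRevisionId) and,
    # unless random_id is passed, ids that are already present
    # (RevisionAlreadyPresent).  For a writable knit `vf` (hypothetical name):
    #
    #     vf._check_add('good-id', ['line\n'], False, True)   # accepted
    #     vf._check_add('bad id', ['line\n'], False, True)    # InvalidRevisionId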

    def _add(self, version_id, lines, parents, delta, parent_texts,
        left_matching_blocks, nostore_sha, random_id):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first parent.

        Any versions not present will be converted into ghosts.
        """
        # 461    0   6546.0390     43.9100   bzrlib.knit:489(_add)
        # +400   0    889.4890    418.9790   +bzrlib.knit:192(lower_fulltext)
        # +461   0   1364.8070    108.8030   +bzrlib.knit:996(add_record)
        # +461   0    193.3940     41.5720   +bzrlib.knit:898(add_version)
        # +461   0    134.0590     18.3810   +bzrlib.osutils:361(sha_strings)
        # +461   0     36.3420     15.4540   +bzrlib.knit:146(make)
        # +1383  0      8.0370      8.0370   +<len>
        # +61    0     13.5770      7.9190   +bzrlib.knit:199(lower_line_delta)
        # +61    0    963.3470      7.8740   +bzrlib.knit:427(_get_content)
        # +61    0    973.9950      5.2950   +bzrlib.knit:136(line_delta)
        # +61    0   1918.1800      5.2640   +bzrlib.knit:359(_merge_annotations)

        # first thing, if the content is something we don't need to store, find
        # that out.
        line_bytes = ''.join(lines)
        digest = sha_string(line_bytes)
        if nostore_sha == digest:
            raise errors.ExistingContent

        present_parents = []
        if parent_texts is None:
            parent_texts = {}
        for parent in parents:
            if self.has_version(parent):
                present_parents.append(parent)

        # can only compress against the left most present parent.
        if (delta and
            (len(present_parents) == 0 or
             present_parents[0] != parents[0])):
            delta = False

        text_length = len(line_bytes)
        options = []
        if lines[-1][-1] != '\n':
            # copy the contents of lines.
            lines = lines[:]
            options.append('no-eol')
            lines[-1] = lines[-1] + '\n'
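
        # Editorial note, not part of the original module: the block above
        # normalises a text whose last line lacks a trailing newline, e.g.
        # ['a\n', 'b'] is stored as ['a\n', 'b\n'] with options == ['no-eol'],
        # and the newline is stripped again on extraction via
        # strip_last_line_newline().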

        if len(present_parents) and delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas. This should minimize both
            # I/O and the time spent applying deltas.
            delta = self._check_should_delta(present_parents)

        assert isinstance(version_id, str)
        content = self.factory.make(lines, version_id)
        if delta or (self.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if needed.
            delta_hunks = self._merge_annotations(content, present_parents,
                parent_texts, delta, self.factory.annotated,
                left_matching_blocks)

        if delta:
            options.append('line-delta')
            store_lines = self.factory.lower_line_delta(delta_hunks)
            size, bytes = self._data._record_to_data(version_id, digest,
                store_lines)
        else:
            options.append('fulltext')
            # get mixed annotation + content and feed it into the
            # serialiser.
            store_lines = self.factory.lower_fulltext(content)
            size, bytes = self._data._record_to_data(version_id, digest,
                store_lines)

        access_memo = self._data.add_raw_records([size], bytes)[0]
        self._index.add_versions(
            ((version_id, options, access_memo, parents),),
            random_id=random_id)
        return digest, text_length, content

    def check(self, progress_bar=None):
        """See VersionedFile.check()."""

                elif method == 'line-delta':
                    delta = self.factory.parse_line_delta(data, version_id)
                    content = content.copy()
                    content._lines = self._apply_delta(content._lines,
                        delta)
                content_map[component_id] = content

            if 'no-eol' in self._index.get_options(version_id):
                content = content.copy()
                content.strip_last_line_newline()
            final_content[version_id] = content

            # digest here is the digest from the last applied component.
            text = content.text()
            if sha_strings(text) != digest:
                raise KnitCorrupt(self.filename,
                    'sha-1 does not match %s' % version_id)

            text_map[version_id] = text
        return text_map, final_content

    @staticmethod
    def _apply_delta(lines, delta):
        """Apply delta to lines."""
        offset = 0
        for start, end, count, delta_lines in delta:
            lines[offset+start:offset+end] = delta_lines
            offset = offset + (start - end) + count
        return lines
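
    # Editorial note, not part of the original module: a worked application of
    # _apply_delta above.  The hunk (1, 2, 2, [...]) replaces the basis line
    # at index 1 with two new lines:
    #
    #     _apply_delta(['one\n', 'two\n', 'three\n'],
    #                  [(1, 2, 2, ['TWO\n', 'TWO-AND-A-HALF\n'])])
    #     => ['one\n', 'TWO\n', 'TWO-AND-A-HALF\n', 'three\n']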

    def iter_lines_added_or_present_in_versions(self, version_ids=None,

        self.source_ancestry = set(self.source.get_ancestry(version_ids))
        this_versions = set(self.target._index.get_versions())
        # XXX: For efficiency we should not look at the whole index,
        #      we only need to consider the referenced revisions - they
        #      must all be present, or the method must be full-text.
        #      TODO, RBC 20070919
        needed_versions = self.source_ancestry - this_versions
        cross_check_versions = self.source_ancestry.intersection(this_versions)
        mismatched_versions = set()
        for version in cross_check_versions:
            # scan to include needed parents.
            n1 = set(self.target.get_parents_with_ghosts(version))
            n2 = set(self.source.get_parents_with_ghosts(version))
            if n1 != n2:
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the n2 revisions.
                for parent in n2:
                    if parent in n1:
                        # safe
                        continue
                    parent_ancestors = self.source.get_ancestry(parent)
                    if version in parent_ancestors:
                        raise errors.GraphCycleError([parent, version])
                # ensure this parent will be available later.
                new_parents = n2.difference(n1)
                needed_versions.update(new_parents.difference(this_versions))
                mismatched_versions.add(version)

        if not needed_versions and not mismatched_versions:
            return 0
        full_list = topo_sort(self.source.get_graph())

            assert version_id == version_id2, 'logic error, inconsistent results'
            count = count + 1
            pb.update("Joining knit", count, total)
            if converter:
                size, raw_data = converter(raw_data, version_id, options,
                    parents)
            else:
                size = len(raw_data)
            raw_records.append((version_id, options, parents, size))
            raw_datum.append(raw_data)
        self.target._add_raw_records(raw_records, ''.join(raw_datum))

        for version in mismatched_versions:
            # FIXME RBC 20060309 is this needed?
            n1 = set(self.target.get_parents_with_ghosts(version))
            n2 = set(self.source.get_parents_with_ghosts(version))
            # write a combined record to our history preserving the current
            # parents as first in the list
            new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
            self.target.fix_parents(version, new_parents)

    def _anno_to_plain_converter(self, raw_data, version_id, options,
                                 parents):
        """Convert annotated content to plain content."""
        data, digest = self.source._data._parse_record(version_id, raw_data)
        if 'fulltext' in options:
            content = self.source.factory.parse_fulltext(data, version_id)
            lines = self.target.factory.lower_fulltext(content)
        else:
            delta = self.source.factory.parse_line_delta(data, version_id,
                plain=True)
            lines = self.target.factory.lower_line_delta(delta)
        return self.target._data._record_to_data(version_id, digest, lines)
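

# Editorial sketch, not part of the original module: the converter above
# re-serialises annotated records as plain ones.  For a fulltext, that comes
# down to dropping the "origin " prefix from every stored line, as this
# simplified illustration shows.
def _strip_annotations_example(annotated_lines):
    # _strip_annotations_example(['rev-1 hello\n', 'rev-2 world\n'])
    # => ['hello\n', 'world\n']
    return [line.split(' ', 1)[1] for line in annotated_lines]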


InterVersionedFile.register_optimiser(InterKnit)

        self.source_ancestry = set(self.source.get_ancestry(version_ids))
        this_versions = set(self.target._index.get_versions())
        needed_versions = self.source_ancestry - this_versions
        cross_check_versions = self.source_ancestry.intersection(this_versions)
        mismatched_versions = set()
        for version in cross_check_versions:
            # scan to include needed parents.
            n1 = set(self.target.get_parents_with_ghosts(version))
            n2 = set(self.source.get_parents(version))
            # if all of n2's parents are in n1, then it's fine.
            if n2.difference(n1):
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the n2 revisions.
                for parent in n2:
                    if parent in n1:
                        # safe
                        continue
                    parent_ancestors = self.source.get_ancestry(parent)
                    if version in parent_ancestors:
                        raise errors.GraphCycleError([parent, version])
                # ensure this parent will be available later.
                new_parents = n2.difference(n1)
                needed_versions.update(new_parents.difference(this_versions))
                mismatched_versions.add(version)

        if not needed_versions and not mismatched_versions:
            return 0
        full_list = topo_sort(self.source.get_graph())


InterVersionedFile.register_optimiser(WeaveToKnit)


class KnitSequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        """

        # CAUTION: stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
            # following improvement
            #     704   0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674   0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519   0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to
            #     704   0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400   0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519   0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize


# Deprecated, use PatienceSequenceMatcher instead
KnitSequenceMatcher = patiencediff.PatienceSequenceMatcher
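

# Editorial sketch, not part of the original module: whichever implementation
# the KnitSequenceMatcher name is bound to, it exposes the familiar
# difflib.SequenceMatcher interface, so a caller can do:
def _sequence_matcher_example():
    s = KnitSequenceMatcher(None, ['a\n', 'b\n', 'c\n'], ['a\n', 'c\n'])
    # get_matching_blocks() returns (i, j, n) triples describing runs of
    # matching lines, ending with the (len(a), len(b), 0) sentinel.
    return s.get_matching_blocks()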


def annotate_knit(knit, revision_id):