1
# Copyright (C) 2005, 2006 by Canonical Ltd
2
# Written by Martin Pool.
3
# Modified by Johan Rydberg <jrydberg@gnu.org>
4
# Modified by Robert Collins <robert.collins@canonical.com>
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation; either version 2 of the License, or
9
# (at your option) any later version.
11
# This program is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
"""Knit versionedfile implementation.
22
A knit is a versioned file implementation that supports efficient append only
28
from difflib import SequenceMatcher
30
from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
31
InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
32
RevisionNotPresent, RevisionAlreadyPresent
33
from bzrlib.trace import mutter
34
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
36
from bzrlib.versionedfile import VersionedFile
37
from bzrlib.tsort import topo_sort
39
from StringIO import StringIO
40
from gzip import GzipFile
43
# TODO: Split out code specific to this format into an associated object.
45
# TODO: Can we put in some kind of value to check that the index and data
46
# files belong together?
48
# TODO: accommodate binaries, perhaps by storing a byte count
50
# TODO: function to check whole file
52
# TODO: atomically append data, then measure backwards from the cursor
53
# position after writing to work out where it was located. we may need to
54
# bypass python file buffering.
57
INDEX_SUFFIX = '.kndx'
60
class KnitContent(object):
61
"""Content of a knit version to which deltas can be applied."""
63
def __init__(self, lines):
66
def annotate_iter(self):
67
"""Yield tuples of (origin, text) for each content line."""
68
for origin, text in self._lines:
72
"""Return a list of (origin, text) tuples."""
73
return list(self.annotate_iter())
75
def apply_delta(self, delta):
76
"""Apply delta to this content."""
78
for start, end, count, lines in delta:
79
self._lines[offset+start:offset+end] = lines
80
offset = offset + (start - end) + count
82
def line_delta_iter(self, new_lines):
83
"""Generate line-based delta from new_lines to this content."""
84
new_texts = [text for origin, text in new_lines._lines]
85
old_texts = [text for origin, text in self._lines]
86
s = difflib.SequenceMatcher(None, old_texts, new_texts)
87
for op in s.get_opcodes():
90
yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])
92
def line_delta(self, new_lines):
93
return list(self.line_delta_iter(new_lines))
96
return [text for origin, text in self._lines]
99
class _KnitFactory(object):
100
"""Base factory for creating content objects."""
102
def make(self, lines, version):
103
num_lines = len(lines)
104
return KnitContent(zip([version] * num_lines, lines))
107
class KnitAnnotateFactory(_KnitFactory):
108
"""Factory for creating annotated Content objects."""
112
def parse_fulltext(self, content, version):
115
origin, text = line.split(' ', 1)
116
lines.append((int(origin), text))
117
return KnitContent(lines)
119
def parse_line_delta_iter(self, lines):
121
header = lines.pop(0)
122
start, end, c = [int(n) for n in header.split(',')]
125
origin, text = lines.pop(0).split(' ', 1)
126
contents.append((int(origin), text))
127
yield start, end, c, contents
129
def parse_line_delta(self, lines, version):
130
return list(self.parse_line_delta_iter(lines))
132
def lower_fulltext(self, content):
133
return ['%d %s' % (o, t) for o, t in content._lines]
135
def lower_line_delta(self, delta):
137
for start, end, c, lines in delta:
138
out.append('%d,%d,%d\n' % (start, end, c))
139
for origin, text in lines:
140
out.append('%d %s' % (origin, text))
144
class KnitPlainFactory(_KnitFactory):
145
"""Factory for creating plain Content objects."""
149
def parse_fulltext(self, content, version):
150
return self.make(content, version)
152
def parse_line_delta_iter(self, lines, version):
154
header = lines.pop(0)
155
start, end, c = [int(n) for n in header.split(',')]
156
yield start, end, c, zip([version] * c, lines[:c])
159
def parse_line_delta(self, lines, version):
160
return list(self.parse_line_delta_iter(lines, version))
162
def lower_fulltext(self, content):
163
return content.text()
165
def lower_line_delta(self, delta):
167
for start, end, c, lines in delta:
168
out.append('%d,%d,%d\n' % (start, end, c))
169
out.extend([text for origin, text in lines])
173
def make_empty_knit(transport, relpath):
174
"""Construct a empty knit at the specified location."""
175
from bzrlib.transactions import PassThroughTransaction
176
k = KnitVersionedFile(transport, relpath, 'w', KnitPlainFactory, \
177
PassThroughTransaction())
181
class KnitVersionedFile(VersionedFile):
182
"""Weave-like structure with faster random access.
184
A knit stores a number of texts and a summary of the relationships
185
between them. Texts are identified by a string version-id. Texts
186
are normally stored and retrieved as a series of lines, but can
187
also be passed as single strings.
189
Lines are stored with the trailing newline (if any) included, to
190
avoid special cases for files with no final newline. Lines are
191
composed of 8-bit characters, not unicode. The combination of
192
these approaches should mean any 'binary' file can be safely
193
stored and retrieved.
196
def __init__(self, transport, relpath, mode, factory, transaction,
197
basis_knit=None, delta=True):
198
"""Construct a knit at location specified by relpath. The
199
given transaction will be used throughout the lifetime of the
202
assert mode in ('r', 'w'), "invalid mode specified"
203
assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
206
self.transaction = transaction
207
self.transport = transport
208
self.filename = relpath
209
self.basis_knit = basis_knit
210
self.factory = factory
211
self.writable = (mode == 'w')
214
self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
216
self._data = _KnitData(transport, relpath + DATA_SUFFIX,
220
"""See VersionedFile.versions."""
221
return self._index.get_versions()
223
def has_version(self, version_id):
224
"""See VersionedFile.has_version."""
225
return self._index.has_version(version_id)
227
__contains__ = has_version
229
def _merge_annotations(self, content, parents):
230
"""Merge annotations for content. This is done by comparing
231
the annotations based on changed to the text."""
232
for parent_id in parents:
233
merge_content = self._get_content(parent_id)
234
seq = SequenceMatcher(None, merge_content.text(), content.text())
235
for i, j, n in seq.get_matching_blocks():
238
content._lines[j:j+n] = merge_content._lines[i:i+n]
240
def _get_components(self, version_id):
241
"""Return a list of (version_id, method, data) tuples that
242
makes up version specified by version_id of the knit.
244
The components should be applied in the order of the returned
247
The basis knit will be used to the largest extent possible
248
since it is assumed that accesses to it is faster.
250
# needed_revisions holds a list of (method, version_id) of
251
# versions that is needed to be fetched to construct the final
252
# version of the file.
254
# basis_revisions is a list of versions that needs to be
255
# fetched but exists in the basis knit.
257
basis = self.basis_knit
264
if basis and basis._index.has_version(cursor):
266
basis_versions.append(cursor)
267
method = picked_knit._index.get_method(cursor)
268
needed_versions.append((method, cursor))
269
if method == 'fulltext':
271
cursor = picked_knit.get_parents(cursor)[0]
276
for comp_id in basis_versions:
277
data_pos, data_size = basis._index.get_data_position(comp_id)
278
records.append((piece_id, data_pos, data_size))
279
components.update(basis._data.read_records(records))
282
for comp_id in [vid for method, vid in needed_versions
283
if vid not in basis_versions]:
284
data_pos, data_size = self._index.get_position(comp_id)
285
records.append((comp_id, data_pos, data_size))
286
components.update(self._data.read_records(records))
288
# get_data_records returns a mapping with the version id as
289
# index and the value as data. The order the components need
290
# to be applied is held by needed_versions (reversed).
292
for method, comp_id in reversed(needed_versions):
293
out.append((comp_id, method, components[comp_id]))
297
def _get_content(self, version_id):
298
"""Returns a content object that makes up the specified
300
if not self.has_version(version_id):
301
raise RevisionNotPresent(version_id, self.filename)
303
if self.basis_knit and version_id in self.basis_knit:
304
return self.basis_knit._get_content(version_id)
307
components = self._get_components(version_id)
308
for component_id, method, (data, digest) in components:
309
version_idx = self._index.lookup(component_id)
310
if method == 'fulltext':
311
assert content is None
312
content = self.factory.parse_fulltext(data, version_idx)
313
elif method == 'line-delta':
314
delta = self.factory.parse_line_delta(data, version_idx)
315
content.apply_delta(delta)
317
if 'no-eol' in self._index.get_options(version_id):
318
line = content._lines[-1][1].rstrip('\n')
319
content._lines[-1] = (content._lines[-1][0], line)
321
if sha_strings(content.text()) != digest:
322
raise KnitCorrupt(self.filename, 'sha-1 does not match')
326
def clone_text(self, new_version_id, old_version_id, parents,
328
"""See VersionedFile.clone_text."""
329
raise NotImplementedError
331
def _check_versions_present(self, version_ids):
332
"""Check that all specified versions are present."""
333
version_ids = set(version_ids)
334
for r in list(version_ids):
335
if self._index.has_version(r):
336
version_ids.remove(r)
338
raise RevisionNotPresent(list(version_ids)[0], self.filename)
340
def add_lines(self, version_id, parents, lines):
341
"""See VersionedFile.add_lines."""
342
assert self.writable, "knit is not opened for write"
343
### FIXME escape. RBC 20060228
344
if contains_whitespace(version_id):
345
raise InvalidRevisionId(version_id)
346
if self.has_version(version_id):
347
raise RevisionAlreadyPresent(version_id, self.filename)
349
if True or __debug__:
351
assert '\n' not in l[:-1]
353
self._check_versions_present(parents)
354
return self._add(version_id, lines[:], parents, self.delta)
356
def _add(self, version_id, lines, parents, delta):
357
"""Add a set of lines on top of version specified by parents.
359
If delta is true, compress the text as a line-delta against
362
if delta and not parents:
365
digest = sha_strings(lines)
368
if lines[-1][-1] != '\n':
369
options.append('no-eol')
370
lines[-1] = lines[-1] + '\n'
372
lines = self.factory.make(lines, len(self._index))
373
if self.factory.annotated and len(parents) > 0:
374
# Merge annotations from parent texts if so is needed.
375
self._merge_annotations(lines, parents)
377
if parents and delta:
378
# To speed the extract of texts the delta chain is limited
379
# to a fixed number of deltas. This should minimize both
380
# I/O and the time spend applying deltas.
382
delta_parents = parents
384
parent = delta_parents[0]
385
method = self._index.get_method(parent)
386
if method == 'fulltext':
388
delta_parents = self._index.get_parents(parent)
390
if method == 'line-delta':
394
options.append('line-delta')
395
content = self._get_content(parents[0])
396
delta_hunks = content.line_delta(lines)
397
store_lines = self.factory.lower_line_delta(delta_hunks)
399
options.append('fulltext')
400
store_lines = self.factory.lower_fulltext(lines)
402
where, size = self._data.add_record(version_id, digest, store_lines)
403
self._index.add_version(version_id, options, where, size, parents)
405
def clone_text(self, new_version_id, old_version_id, parents, transaction):
406
"""See VersionedFile.clone_text()."""
407
# FIXME RBC 20060228 make fast by only inserting an index with null delta.
408
self.add_lines(new_version_id, parents, self.get_lines(old_version_id))
410
def get_lines(self, version_id):
411
"""See VersionedFile.get_lines()."""
412
return self._get_content(version_id).text()
414
def annotate_iter(self, version_id):
415
"""See VersionedFile.annotate_iter."""
416
content = self._get_content(version_id)
417
for origin, text in content.annotate_iter():
418
yield self._index.idx_to_name(origin), text
420
def get_parents(self, version_id):
421
"""See VersionedFile.get_parents."""
422
self._check_versions_present([version_id])
423
return list(self._index.get_parents(version_id))
425
def get_ancestry(self, versions):
426
"""See VersionedFile.get_ancestry."""
427
if isinstance(versions, basestring):
428
versions = [versions]
431
self._check_versions_present(versions)
432
return self._index.get_ancestry(versions)
434
def _reannotate_line_delta(self, other, lines, new_version_id,
436
"""Re-annotate line-delta and return new delta."""
438
for start, end, count, contents \
439
in self.factory.parse_line_delta_iter(lines):
441
for origin, line in contents:
442
old_version_id = other._index.idx_to_name(origin)
443
if old_version_id == new_version_id:
444
idx = new_version_idx
446
idx = self._index.lookup(old_version_id)
447
new_lines.append((idx, line))
448
new_delta.append((start, end, count, new_lines))
450
return self.factory.lower_line_delta(new_delta)
452
def _reannotate_fulltext(self, other, lines, new_version_id,
454
"""Re-annotate fulltext and return new version."""
455
content = self.factory.parse_fulltext(lines, new_version_idx)
457
for origin, line in content.annotate_iter():
458
old_version_id = other._index.idx_to_name(origin)
459
if old_version_id == new_version_id:
460
idx = new_version_idx
462
idx = self._index.lookup(old_version_id)
463
new_lines.append((idx, line))
465
return self.factory.lower_fulltext(KnitContent(new_lines))
467
def join(self, other, pb=None, msg=None, version_ids=None):
468
"""See VersionedFile.join."""
469
assert isinstance(other, KnitVersionedFile)
471
if version_ids is None:
472
version_ids = other.versions()
477
from bzrlib.progress import DummyProgress
480
version_ids = list(version_ids)
481
if None in version_ids:
482
version_ids.remove(None)
484
other_ancestry = set(other.get_ancestry(version_ids))
485
needed_versions = other_ancestry - set(self._index.get_versions())
486
if not needed_versions:
488
full_list = topo_sort(other._index.get_graph())
490
version_list = [i for i in full_list if (not self.has_version(i)
491
and i in needed_versions)]
494
for version_id in version_list:
495
data_pos, data_size = other._index.get_position(version_id)
496
records.append((version_id, data_pos, data_size))
499
for version_id, lines, digest \
500
in other._data.read_records_iter(records):
501
options = other._index.get_options(version_id)
502
parents = other._index.get_parents(version_id)
504
for parent in parents:
505
assert self.has_version(parent)
507
if self.factory.annotated:
508
# FIXME jrydberg: it should be possible to skip
509
# re-annotating components if we know that we are
510
# going to pull all revisions in the same order.
511
new_version_id = version_id
512
new_version_idx = self._index.num_versions()
513
if 'fulltext' in options:
514
lines = self._reannotate_fulltext(other, lines,
515
new_version_id, new_version_idx)
516
elif 'line-delta' in options:
517
lines = self._reannotate_line_delta(other, lines,
518
new_version_id, new_version_idx)
521
pb.update(self.filename, count, len(version_list))
523
pos, size = self._data.add_record(version_id, digest, lines)
524
self._index.add_version(version_id, options, pos, size, parents)
529
def walk(self, version_ids):
530
"""See VersionedFile.walk."""
531
# We take the short path here, and extract all relevant texts
532
# and put them in a weave and let that do all the work. Far
533
# from optimal, but is much simpler.
534
from bzrlib.weave import Weave
535
from bzrlib.transactions import PassThroughTransaction
537
w = Weave(self.filename)
538
ancestry = self.get_ancestry(version_ids)
539
sorted_graph = topo_sort(self._index.get_graph())
540
version_list = [vid for vid in sorted_graph if vid in ancestry]
541
txn = PassThroughTransaction()
543
for version_id in version_list:
544
lines = self.get_lines(version_id)
545
w.add_lines(version_id, self.get_parents(version_id), lines)
547
for lineno, insert_id, dset, line in w.walk(version_ids):
548
yield lineno, insert_id, dset, line
551
class _KnitComponentFile(object):
552
"""One of the files used to implement a knit database"""
554
def __init__(self, transport, filename, mode, transaction):
555
self._transport = transport
556
self._filename = filename
557
self._transaction = transaction
560
def write_header(self):
561
old_len = self._transport.append(self._filename, self.HEADER)
563
raise KnitCorrupt(self._filename, 'misaligned after writing header')
565
def check_header(self, fp):
566
line = fp.read(len(self.HEADER))
567
if line != self.HEADER:
568
raise KnitHeaderError(badline=line)
571
"""Commit is a nop."""
574
return '%s(%s)' % (self.__class__.__name__, self._filename)
577
class _KnitIndex(_KnitComponentFile):
578
"""Manages knit index file.
580
The index is already kept in memory and read on startup, to enable
581
fast lookups of revision information. The cursor of the index
582
file is always pointing to the end, making it easy to append
585
_cache is a cache for fast mapping from version id to a Index
588
_history is a cache for fast mapping from indexes to version ids.
590
The index data format is dictionary compressed when it comes to
parent references; an index entry may only have parents with a
lower index number.  As a result, the index is topologically sorted.
595
HEADER = "# bzr knit index 7\n"
597
def _cache_version(self, version_id, options, pos, size, parents):
598
val = (version_id, options, pos, size, parents)
599
self._cache[version_id] = val
600
self._history.append(version_id)
602
def _iter_index(self, fp):
604
for l in lines.splitlines(False):
607
def __init__(self, transport, filename, mode, transaction):
608
_KnitComponentFile.__init__(self, transport, filename, mode, transaction)
612
fp = self._transport.get(self._filename)
613
self.check_header(fp)
614
for rec in self._iter_index(fp):
615
self._cache_version(rec[0], rec[1].split(','), int(rec[2]), int(rec[3]),
616
[self._history[int(i)] for i in rec[4:]])
617
except NoSuchFile, e:
624
for version_id, index in self._cache.iteritems():
625
graph.append((version_id, index[4]))
628
def get_ancestry(self, versions):
629
"""See VersionedFile.get_ancestry."""
631
for version_id in versions:
632
version_idxs.append(self._history.index(version_id))
634
for v in xrange(max(version_idxs), 0, -1):
635
if self._history[v] in i:
636
# include all its parents
637
i.update(self._cache[self._history[v]][4])
640
def num_versions(self):
641
return len(self._history)
643
__len__ = num_versions
645
def get_versions(self):
648
def idx_to_name(self, idx):
649
return self._history[idx]
651
def lookup(self, version_id):
652
assert version_id in self._cache
653
return self._history.index(version_id)
655
def add_version(self, version_id, options, pos, size, parents):
656
"""Add a version record to the index."""
657
self._cache_version(version_id, options, pos, size, parents)
659
content = "%s %s %s %s %s\n" % (version_id,
663
' '.join([str(self.lookup(vid)) for
665
self._transport.append(self._filename, content)
667
def has_version(self, version_id):
668
"""True if the version is in the index."""
669
return self._cache.has_key(version_id)
671
def get_position(self, version_id):
672
"""Return data position and size of specified version."""
673
return (self._cache[version_id][2], \
674
self._cache[version_id][3])
676
def get_method(self, version_id):
677
"""Return compression method of specified version."""
678
options = self._cache[version_id][1]
679
if 'fulltext' in options:
682
assert 'line-delta' in options
685
def get_options(self, version_id):
686
return self._cache[version_id][1]
688
def get_parents(self, version_id):
689
"""Return parents of specified version."""
690
return self._cache[version_id][4]
692
def check_versions_present(self, version_ids):
693
"""Check that all specified versions are present."""
694
version_ids = set(version_ids)
695
for version_id in list(version_ids):
696
if version_id in self._cache:
697
version_ids.remove(version_id)
699
raise RevisionNotPresent(list(version_ids)[0], self.filename)
702
class _KnitData(_KnitComponentFile):
703
"""Contents of the knit data file"""
705
HEADER = "# bzr knit data 7\n"
707
def __init__(self, transport, filename, mode, transaction):
708
_KnitComponentFile.__init__(self, transport, filename, mode, transaction)
710
self._checked = False
712
def _open_file(self):
713
if self._file is None:
715
self._file = self._transport.get(self._filename)
720
def add_record(self, version_id, digest, lines):
721
"""Write new text record to disk. Returns the position in the
722
file where it was written."""
724
data_file = GzipFile(None, mode='wb', fileobj=sio)
725
print >>data_file, "version %s %d %s" % (version_id, len(lines), digest)
726
data_file.writelines(lines)
727
print >>data_file, "end %s\n" % version_id
730
content = sio.getvalue()
731
start_pos = self._transport.append(self._filename, content)
732
return start_pos, len(content)
734
def _parse_record(self, version_id, data):
735
df = GzipFile(mode='rb', fileobj=StringIO(data))
736
rec = df.readline().split()
738
raise KnitCorrupt(self._filename, 'unexpected number of records')
739
if rec[1] != version_id:
740
raise KnitCorrupt(self.file.name,
741
'unexpected version, wanted %r' % version_id)
743
record_contents = self._read_record_contents(df, lines)
745
if l != 'end %s\n' % version_id:
746
raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
748
return record_contents, rec[3]
750
def _read_record_contents(self, df, record_lines):
751
"""Read and return n lines from datafile."""
753
for i in range(record_lines):
754
r.append(df.readline())
757
def read_records_iter(self, records):
758
"""Read text records from data file and yield result.
760
Each passed record is a tuple of (version_id, pos, len) and
761
will be read in the given order. Yields (version_id,
765
class ContinuousRange:
766
def __init__(self, rec_id, pos, size):
768
self.end_pos = pos + size
769
self.versions = [(rec_id, pos, size)]
771
def add(self, rec_id, pos, size):
772
if self.end_pos != pos:
774
self.end_pos = pos + size
775
self.versions.append((rec_id, pos, size))
779
for rec_id, pos, size in self.versions:
780
yield rec_id, fp.read(size)
782
fp = self._open_file()
784
# Loop through all records and try to collect as large
785
# continuous region as possible to read.
787
record_id, pos, size = records.pop(0)
788
continuous_range = ContinuousRange(record_id, pos, size)
790
record_id, pos, size = records[0]
791
if continuous_range.add(record_id, pos, size):
795
fp.seek(continuous_range.start_pos, 0)
796
for record_id, data in continuous_range.split(fp):
797
content, digest = self._parse_record(record_id, data)
798
yield record_id, content, digest
802
def read_records(self, records):
803
"""Read records into a dictionary."""
805
for record_id, content, digest in self.read_records_iter(records):
806
components[record_id] = (content, digest)