~bzr-pqm/bzr/bzr.dev : revision 4216.5.1

1

2

#

3

# This program is free software; you can redistribute it and/or modify

4

# it under the terms of the GNU General Public License as published by

5

# the Free Software Foundation; either version 2 of the License, or

6

# (at your option) any later version.

7

#

8

# This program is distributed in the hope that it will be useful,

9

# but WITHOUT ANY WARRANTY; without even the implied warranty of

10

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11

# GNU General Public License for more details.

12

#

13

# You should have received a copy of the GNU General Public License

14

# along with this program; if not, write to the Free Software

15

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

16

17

"""Core compression logic for compressing streams of related files."""

18

19

from itertools import izip

20

from cStringIO import StringIO

21

import struct

22

import time

23

import zlib

24

try:

25

import pylzma

26

except ImportError:

27

pylzma = None

28

29

from bzrlib import (

30

annotate,

31

debug,

32

diff,

33

errors,

34

graph as _mod_graph,

35

osutils,

36

pack,

37

patiencediff,

38

trace,

39

)

40

from bzrlib.graph import Graph

41

from bzrlib.knit import _DirectPackAccess

42

from bzrlib.osutils import (

43

contains_whitespace,

44

sha_string,

45

split_lines,

46

)

47

from bzrlib.btree_index import BTreeBuilder

48

from bzrlib.lru_cache import LRUSizeCache

49

from bzrlib.tsort import topo_sort

50

from bzrlib.versionedfile import (

51

adapter_registry,

52

AbsentContentFactory,

53

ChunkedContentFactory,

54

FulltextContentFactory,

55

VersionedFiles,

56

)

57

58

# Feature switches for the group-compress block format.
# _USE_LZMA is hard-wired off here: the leading ``False and`` short-circuits,
# so zlib is used by to_bytes() even when pylzma imported successfully.
_USE_LZMA = False and (pylzma is not None)

59

# When true, to_bytes() writes an empty label header and from_bytes() does
# not call _parse_header().
_NO_LABELS = True

60

# When true, compress() only advances the delta index's source offset for a
# delta record instead of adding the delta bytes as a new source.
_FAST = False

61

62

# osutils.sha_string('')
# i.e. the sha1 of the empty string; returned by compress() for empty texts.

63

_null_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'

64

65

66

def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding.

    Each output character carries seven bits of ``val``, least significant
    group first.  Every character except the last has its high bit set to
    signal that another character follows.

    :param val: A non-negative integer.
    :return: The encoded string (always at least one character long).
    :raises ValueError: If ``val`` is negative (raised by ``chr``).
    """
    chunks = []
    while val >= 0x80:
        # Low seven bits, with the continuation flag forced on.
        chunks.append(chr((val | 0x80) & 0xFF))
        val >>= 7
    # Final group: high bit clear marks the end of the number.
    chunks.append(chr(val))
    return ''.join(chunks)

75

76

77

def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding.

    Reading starts at the first character of ``bytes`` and stops at the
    first character whose high bit is clear; anything after that is ignored.

    :param bytes: A string beginning with an encoded integer.
    :return: A tuple ``(value, consumed)`` where ``consumed`` is the number
        of characters that made up the integer.
    :raises IndexError: If ``bytes`` ends before a terminating character.
    """
    result = 0
    position = 0
    bit_shift = 0
    while True:
        current = ord(bytes[position])
        position += 1
        if current < 0x80:
            # Terminating group: no continuation flag to strip.
            break
        result |= (current & 0x7F) << bit_shift
        bit_shift += 7
    result |= current << bit_shift
    return result, position

91

92

93

def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into groupcompress order.

    groupcompress order is (currently) reverse-topological order, with the
    keys grouped by their prefix.  A key that is a plain string or a
    1-tuple has the empty prefix; otherwise the prefix is ``key[0]``.

    :param parent_map: A dict mapping each key to its parents.
    :return: A sorted-list of keys
    """
    # Bucket the (key, parents) pairs by prefix so that each bucket can be
    # topologically sorted on its own.
    grouped = {}
    for key, parents in parent_map.iteritems():
        if isinstance(key, str) or len(key) == 1:
            prefix = ''
        else:
            prefix = key[0]
        grouped.setdefault(prefix, []).append((key, parents))

    ordered_keys = []
    for prefix in sorted(grouped):
        # topo_sort gives parents-first; groupcompress wants the reverse.
        for key in reversed(topo_sort(grouped[prefix])):
            ordered_keys.append(key)
    return ordered_keys

119

120

121

class GroupCompressBlockEntry(object):
    """Metadata describing one object stored inside a GC group.

    This is a plain record: it holds the values it was given and adds only
    the derived ``end`` offset.
    """

    def __init__(self, key, type, sha1, start, length):
        # key:    the key of the stored text
        # type:   delta, fulltext, external?
        # sha1:   sha1 of the content
        self.key, self.type, self.sha1 = key, type, sha1
        # Byte offset to the start of the data, and the length of the content.
        self.start, self.length = start, length

    @property
    def end(self):
        """Offset of the byte just past this entry (start + length)."""
        return self.start + self.length

    def __repr__(self):
        fields = (self.key, self.type, self.sha1, self.start, self.length)
        return '%s(%s, %s, %s, %s, %s)' % (
            (self.__class__.__name__,) + fields)

143

144

# The max zlib window size is 32kB, so if we set 'max_size' output of the

145

# decompressor to the requested bytes + 32kB, then we should guarantee

146

# num_bytes coming out.
# GroupCompressBlock._ensure_content() adds this to the number of bytes it
# still needs whenever it asks the zlib decompressor for partial output.

147

_ZLIB_DECOMP_WINDOW = 32*1024

148

149

class GroupCompressBlock(object):
    """An object which maintains the internal structure of the compressed data.

    This tracks the meta info (start of text, length, type, etc.)

    The serialised form written by to_bytes() and read by from_bytes() is:
    a 6 byte magic (GCB_HEADER or GCB_LZ_HEADER), four newline-terminated
    decimal lengths (compressed header, uncompressed header, compressed
    content, uncompressed content), the compressed header bytes, and then
    the compressed content bytes.
    """

    # Group Compress Block v1 Zlib
    GCB_HEADER = 'gcb1z\n'
    GCB_LZ_HEADER = 'gcb1l\n'

    def __init__(self):
        # map by key? or just order in file?
        self._entries = {}
        self._compressor_name = None
        self._z_header_length = None
        self._header_length = None
        self._z_header = None
        self._z_content = None
        self._z_content_decompressor = None
        self._z_content_length = None
        self._content_length = None
        self._content = None

    def __len__(self):
        # NOTE(review): both lengths start out as None, so this raises
        # TypeError until they have been populated.
        return self._content_length + self._header_length

    def _parse_header(self):
        """Parse the header part of the block.

        Decompresses self._z_header and registers one entry (via
        add_entry) for every blank-line-terminated group of ``name:value``
        lines.  The compressed header is dropped once consumed.
        """
        assert self._z_header is not None
        if self._z_header == '':
            # Nothing to process
            self._z_header = None
            return
        if self._compressor_name == 'lzma':
            header = pylzma.decompress(self._z_header)
        else:
            assert self._compressor_name == 'zlib'
            header = zlib.decompress(self._z_header)
        self._z_header = None # We have consumed the header
        lines = header.split('\n')
        del header
        info_dict = {}
        for line in lines:
            if not line: #End of record
                if not info_dict:
                    # Two blank lines in a row: nothing more to read.
                    break
                self.add_entry(**info_dict)
                info_dict = {}
                continue
            key, value = line.split(':', 1)
            if key == 'key':
                value = tuple(map(intern, value.split('\x00')))
            elif key in ('start', 'length'):
                value = int(value)
            elif key == 'type':
                value = intern(value)
            info_dict[key] = value

    def _ensure_content(self, num_bytes=None):
        """Make sure that content has been expanded enough.

        :param num_bytes: Ensure that we have extracted at least num_bytes of
            content. If None, consume everything
        """
        # TODO: If we re-use the same content block at different times during
        #       get_record_stream(), it is possible that the first pass will
        #       get inserted, triggering an extract/_ensure_content() which
        #       will get rid of _z_content. And then the next use of the block
        #       will try to access _z_content (to send it over the wire), and
        #       fail because it is already extracted. Consider never releasing
        #       _z_content because of this.
        if num_bytes is None:
            num_bytes = self._content_length
        if self._content_length is not None:
            assert num_bytes <= self._content_length
        if self._content is None:
            assert self._z_content is not None
            if self._z_content == '':
                self._content = ''
            elif self._compressor_name == 'lzma':
                # We don't do partial lzma decomp yet
                self._content = pylzma.decompress(self._z_content)
            else:
                # Start a zlib decompressor
                assert self._compressor_name == 'zlib'
                if num_bytes is None:
                    self._content = zlib.decompress(self._z_content)
                else:
                    self._z_content_decompressor = zlib.decompressobj()
                    # Seed the decompressor with the uncompressed bytes, so
                    # that the rest of the code is simplified
                    self._content = self._z_content_decompressor.decompress(
                        self._z_content, num_bytes + _ZLIB_DECOMP_WINDOW)
                    # Any bytes remaining to be decompressed will be in the
                    # decompressors 'unconsumed_tail'
        # Do we have enough bytes already?
        if num_bytes is not None and len(self._content) >= num_bytes:
            return
        if num_bytes is None and self._z_content_decompressor is None:
            # We must have already decompressed everything
            return
        # If we got this far, and don't have a decompressor, something is wrong
        assert self._z_content_decompressor is not None
        remaining_decomp = self._z_content_decompressor.unconsumed_tail
        if num_bytes is None:
            if remaining_decomp:
                # We don't know how much is left, but we'll decompress it all
                self._content += self._z_content_decompressor.decompress(
                    remaining_decomp)
                # Note: There what I consider a bug in zlib.decompressobj
                #       If you pass back in the entire unconsumed_tail, only
                #       this time you don't pass a max-size, it doesn't
                #       change the unconsumed_tail back to None/''.
                #       However, we know we are done with the whole stream
                self._z_content_decompressor = None
            self._content_length = len(self._content)
        else:
            # If we have nothing left to decomp, we ran out of decomp bytes
            assert remaining_decomp
            needed_bytes = num_bytes - len(self._content)
            # We always set max_size to 32kB over the minimum needed, so that
            # zlib will give us as much as we really want.
            # TODO: If this isn't good enough, we could make a loop here,
            #       that keeps expanding the request until we get enough
            self._content += self._z_content_decompressor.decompress(
                remaining_decomp, needed_bytes + _ZLIB_DECOMP_WINDOW)
            assert len(self._content) >= num_bytes
            if not self._z_content_decompressor.unconsumed_tail:
                # The stream is finished
                self._z_content_decompressor = None

    def _parse_bytes(self, bytes):
        """Read the various lengths from the header.

        This also populates the various 'compressed' buffers.

        :return: The position in bytes just after the last newline
        """
        # At present, there are 4 lengths to be read, we have 2 integers for
        # the length of the compressed and uncompressed header, and 2 integers
        # for the compressed and uncompressed content
        # 14 bytes can represent > 1TB, so to avoid checking too far, cap the
        # search to 14 bytes.
        pos = bytes.index('\n', 6, 20)
        self._z_header_length = int(bytes[6:pos])
        pos += 1
        pos2 = bytes.index('\n', pos, pos + 14)
        self._header_length = int(bytes[pos:pos2])
        end_of_z_lengths = pos2
        pos2 += 1
        # Older versions don't have the content lengths, if we want to preserve
        # backwards compatibility, we could try/except over these, and allow
        # them to be skipped
        try:
            pos = bytes.index('\n', pos2, pos2 + 14)
            self._z_content_length = int(bytes[pos2:pos])
            pos += 1
            pos2 = bytes.index('\n', pos, pos + 14)
            self._content_length = int(bytes[pos:pos2])
            pos = pos2 + 1
            # NOTE: these stay as asserts on purpose; a ValueError raised
            # here would be swallowed by the handler below and mis-read as
            # the older format.
            assert len(bytes) == (pos + self._z_header_length +
                                  self._z_content_length)
            pos2 = pos + self._z_header_length
            self._z_header = bytes[pos:pos2]
            self._z_content = bytes[pos2:]
            assert len(self._z_content) == self._z_content_length
        except ValueError:
            # This is the older form, which did not encode its content length
            pos = end_of_z_lengths + 1
            pos2 = pos + self._z_header_length
            self._z_header = bytes[pos:pos2]
            self._z_content = bytes[pos2:]
            self._z_content_length = len(self._z_content)

    @classmethod
    def from_bytes(cls, bytes):
        """Build a block from its serialised form.

        :param bytes: A string produced by to_bytes().
        :return: A new block with its compressed buffers populated; the
            content itself is not decompressed here.
        :raises ValueError: If bytes does not start with a known magic.
        """
        out = cls()
        if bytes[:6] not in (cls.GCB_HEADER, cls.GCB_LZ_HEADER):
            raise ValueError('bytes did not start with %r' % (cls.GCB_HEADER,))
        # The fifth character of the magic selects the compressor.
        if bytes[4] == 'z':
            out._compressor_name = 'zlib'
        elif bytes[4] == 'l':
            out._compressor_name = 'lzma'
        else:
            raise ValueError('unknown compressor: %r' % (bytes,))
        out._parse_bytes(bytes)
        if not _NO_LABELS:
            out._parse_header()
        return out

    def extract(self, key, start, end, sha1=None):
        """Extract the text for a specific key.

        :param key: The label used for this content
        :param start: Offset of the record's first byte in the content
        :param end: Offset of the byte just after the record
        :param sha1: TODO (should we validate only when sha1 is supplied?)
        :return: The bytes for the content
        :raises ValueError: If the record's control code is unknown or its
            encoded length disagrees with ``end``.
        """
        if start == end == 0:
            return ''
        self._ensure_content(end)
        # The bytes are 'f' or 'd' for the type, then a variable-length
        # base128 integer for the content size, then the actual content
        # We know that the variable-length integer won't be longer than 5
        # bytes (it takes 5 bytes to encode 2^32)
        c = self._content[start]
        if c not in ('f', 'd'):
            raise ValueError('Unknown content control code: %s'
                             % (c,))
        content_len, len_len = decode_base128_int(
                            self._content[start + 1:start + 6])
        content_start = start + 1 + len_len
        if end != content_start + content_len:
            raise ValueError('end != len according to field header'
                ' %s != %s' % (end, content_start + content_len))
        content = self._content[content_start:end]
        if c == 'f':
            # Fulltext: the payload is the text itself.
            return content
        # Delta: the payload is applied against the block content.
        return _groupcompress_pyx.apply_delta(self._content, content)

    def add_entry(self, key, type, sha1, start, length):
        """Add new meta info about an entry.

        :param key: The key for the new content
        :param type: Whether this is a delta or fulltext entry (external?)
        :param sha1: sha1sum of the fulltext of this entry
        :param start: where the encoded bytes start
        :param length: total number of bytes in the encoded form
        :return: The new GroupCompressBlockEntry
        :raises ValueError: If key has already been added.
        """
        entry = GroupCompressBlockEntry(key, type, sha1, start, length)
        if key in self._entries:
            raise ValueError('Duplicate key found: %s' % (key,))
        self._entries[key] = entry
        return entry

    def set_content(self, content):
        """Set the content of this block."""
        self._content_length = len(content)
        self._content = content
        # Any previously compressed form no longer matches.
        self._z_content = None
        self._z_header_length = None

    def to_bytes(self):
        """Encode the information into a byte stream.

        The compressed content is cached on the block the first time it is
        produced.

        :return: The serialised block as a string.
        """
        compress = zlib.compress
        if _USE_LZMA:
            compress = pylzma.compress
        # TODO: we may want to have the header compressed in the same chain
        #       as the data, or we may not, evaluate it
        #       having them compressed together is probably a win for
        #       revisions and the 'inv' portion of chk inventories. As the
        #       label in the header is duplicated in the text.
        #       For chk pages and real bytes, I would guess this is not
        #       true.
        if _NO_LABELS:
            # No labels are written, so do not build (and compress) a header
            # that would only be thrown away.
            z_header_bytes = ''
            z_header_len = 0
            info_len = 0
        else:
            chunks = []
            for key in sorted(self._entries):
                entry = self._entries[key]
                chunk = ('key:%s\n'
                         'sha1:%s\n'
                         'type:%s\n'
                         'start:%s\n'
                         'length:%s\n'
                         '\n'
                         ) % ('\x00'.join(entry.key),
                              entry.sha1,
                              entry.type,
                              entry.start,
                              entry.length,
                              )
                chunks.append(chunk)
            header_text = ''.join(chunks)
            info_len = len(header_text)
            z_header_bytes = compress(header_text)
            del header_text, chunks
            z_header_len = len(z_header_bytes)
        if self._z_content is not None:
            # Re-use the compressed form we already hold.
            content_len = self._content_length
            z_content_len = self._z_content_length
            z_content_bytes = self._z_content
        else:
            assert self._content is not None
            content_len = self._content_length
            z_content_bytes = compress(self._content)
            self._z_content = z_content_bytes
            z_content_len = len(z_content_bytes)
            self._z_content_length = z_content_len
        if _USE_LZMA:
            header = self.GCB_LZ_HEADER
        else:
            header = self.GCB_HEADER
        chunks = [header,
                  '%d\n%d\n%d\n%d\n' % (z_header_len, info_len,
                                        z_content_len, content_len)
                 ]
        chunks.append(z_header_bytes)
        chunks.append(z_content_bytes)
        return ''.join(chunks)

456

457

458

class _LazyGroupCompressFactory(object):
    """Yield content from a GroupCompressBlock on demand."""

    def __init__(self, key, parents, manager, start, end, first):
        """Create a _LazyGroupCompressFactory

        :param key: The key of just this record
        :param parents: The parents of this key (possibly None)
        :param manager: The object that owns the block; it is asked for
            ``_wire_bytes()``, ``_prepare_for_extract()`` and ``_block``
        :param start: Offset of the first byte for this record in the
            uncompressed content
        :param end: Offset of the byte just after the end of this record
            (ie, bytes = content[start:end])
        :param first: Is this the first Factory for the given block?
        """
        self.key = key
        self.parents = parents
        self.sha1 = None
        # Note: This attribute coupled with Manager._factories creates a
        #       reference cycle. Perhaps we would rather use a weakref(), or
        #       find an appropriate time to release the ref. After the first
        #       get_bytes_as call? After Manager.get_record_stream() returns
        #       the object?
        self._manager = manager
        self._bytes = None
        # Only the first factory of a block carries the block itself; the
        # others are references to it.
        if first:
            self.storage_kind = 'groupcompress-block'
        else:
            self.storage_kind = 'groupcompress-block-ref'
        self._first = first
        self._start = start
        self._end = end

    def __repr__(self):
        return '%s(%s, first=%s)' % (self.__class__.__name__,
            self.key, self._first)

    def get_bytes_as(self, storage_kind):
        """Return this record in the requested representation.

        :param storage_kind: This factory's own storage kind, 'fulltext' or
            'chunked'.
        :return: For the native kind, the manager's wire bytes when this is
            the first factory and '' otherwise.  For 'fulltext' the text;
            for 'chunked' a one-element list holding the text.
        :raises errors.UnavailableRepresentation: For any other kind.
        """
        if storage_kind == self.storage_kind:
            if not self._first:
                return ''
            # wire bytes, something...
            return self._manager._wire_bytes()
        if storage_kind not in ('fulltext', 'chunked'):
            raise errors.UnavailableRepresentation(self.key, storage_kind,
                                                   self.storage_kind)
        if self._bytes is None:
            # Grab and cache the raw bytes for this entry
            # and break the ref-cycle with _manager since we don't need it
            # anymore
            self._manager._prepare_for_extract()
            self._bytes = self._manager._block.extract(
                self.key, self._start, self._end)
            # XXX: It seems the smart fetch extracts inventories and chk
            #      pages as fulltexts to find the next chk pages, but then
            #      passes them down to be inserted as a
            #      groupcompress-block, so this is not safe to do. Perhaps
            #      we could just change the storage kind to "fulltext" at
            #      that point?
            # self._manager = None
        if storage_kind == 'fulltext':
            return self._bytes
        return [self._bytes]

522

523

524

class _LazyGroupContentManager(object):
    """This manages a group of _LazyGroupCompressFactory objects."""

    def __init__(self, block):
        self._block = block
        # We need to preserve the ordering
        self._factories = []
        # Highest 'end' offset of any factory added so far.
        self._last_byte = 0

    def add_factory(self, key, parents, start, end):
        """Register a record of the block and create its factory.

        The first factory added is flagged as the one carrying the block.
        """
        is_first = not self._factories
        # Note that this creates a reference cycle....
        factory = _LazyGroupCompressFactory(key, parents, self,
            start, end, first=is_first)
        # max() works here, but as a function call, doing a compare seems to be
        # significantly faster, timeit says 250ms for max() and 100ms for the
        # comparison
        if end > self._last_byte:
            self._last_byte = end
        self._factories.append(factory)

    def get_record_stream(self):
        """Get a record for all keys added so far."""
        for factory in self._factories:
            yield factory
            # Break the ref-cycle
            factory._bytes = None
            # XXX: this is not safe, the smart fetch code requests the content
            #      as both a 'fulltext', and then later on as a
            #      groupcompress-block. The iter_interesting_nodes code also is
            #      still buffering multiple records and returning them later.
            #      So that code would need to be updated to either re-fetch the
            #      original object, or buffer it somehow.
            # factory._manager = None
        # TODO: Consider setting self._factories = None after the above loop,
        #       as it will break the reference cycle

    def _trim_block(self, last_byte):
        """Create a new GroupCompressBlock, with just some of the content."""
        # None of the factories need to be adjusted, because the content is
        # located in an identical place. Just that some of the unreferenced
        # trailing bytes are stripped
        trace.mutter('stripping trailing bytes from groupcompress block'
                     ' %d => %d', self._block._content_length, last_byte)
        trimmed = GroupCompressBlock()
        self._block._ensure_content(last_byte)
        trimmed.set_content(self._block._content[:last_byte])
        self._block = trimmed

    def _rebuild_block(self):
        """Create a new GroupCompressBlock with only the referenced texts."""
        compressor = GroupCompressor()
        started = time.time()
        old_length = self._block._content_length
        end_point = 0
        for factory in self._factories:
            text = factory.get_bytes_as('fulltext')
            (found_sha1, start_point, end_point, _kind,
             _length) = compressor.compress(factory.key, text, factory.sha1)
            # Now update this factory with the new offsets, etc
            factory.sha1 = found_sha1
            factory._start = start_point
            factory._end = end_point
        self._last_byte = end_point
        rebuilt = compressor.flush()
        # TODO: Should we check that new_block really *is* smaller than the old
        #       block? It seems hard to come up with a method that it would
        #       expand, since we do full compression again. Perhaps based on a
        #       request that ends up poorly ordered?
        elapsed = time.time() - started
        self._block = rebuilt
        trace.mutter('creating new compressed block on-the-fly in %.3fs'
                     ' %d bytes => %d bytes', elapsed, old_length,
                     self._block._content_length)

    def _prepare_for_extract(self):
        """A _LazyGroupCompressFactory is about to extract to fulltext."""
        # We expect that if one child is going to fulltext, all will be. This
        # helps prevent all of them from extracting a small amount at a time.
        # Which in itself isn't terribly expensive, but resizing 2MB 32kB at a
        # time (self._block._content) is a little expensive.
        self._block._ensure_content(self._last_byte)

    def _check_rebuild_block(self):
        """Check to see if our block should be repacked."""
        bytes_used = 0
        furthest_end = 0
        for factory in self._factories:
            bytes_used += factory._end - factory._start
            if factory._end > furthest_end:
                furthest_end = factory._end
        # If we are using most of the bytes from the block, we have nothing
        # else to check (currently more than 1/2)
        if bytes_used * 2 >= self._block._content_length:
            return
        # Can we just strip off the trailing bytes? If we are going to be
        # transmitting more than 50% of the front of the content, go ahead
        if bytes_used * 2 > furthest_end:
            self._trim_block(furthest_end)
            return

        # We are using a small amount of the data, and it isn't just packed
        # nicely at the front, so rebuild the content.
        # Note: This would be *nicer* as a strip-data-from-group, rather than
        #       building it up again from scratch
        #       It might be reasonable to consider the fulltext sizes for
        #       different bits when deciding this, too. As you may have a small
        #       fulltext, and a trivial delta, and you are just trading around
        #       for another fulltext. If we do a simple 'prune' you may end up
        #       expanding many deltas into fulltexts, as well.
        #       If we build a cheap enough 'strip', then we could try a strip,
        #       if that expands the content, we then rebuild.
        self._rebuild_block()

    def _wire_bytes(self):
        """Return a byte stream suitable for transmitting over the wire."""
        self._check_rebuild_block()
        # The outer block starts with:
        #   'groupcompress-block\n'
        #   <length of compressed key info>\n
        #   <length of uncompressed info>\n
        #   <length of gc block>\n
        #   <header bytes>
        #   <gc-block>
        # The minimal info we need is the key, the start offset, and the
        # parents. The length and type are encoded in the record itself.
        # However, passing in the other bits makes it easier.  The list of
        # keys, and the start offset, the length
        # 1 line key
        # 1 line with parents, '' for ()
        # 1 line for start offset
        # 1 line for end byte
        records = []
        for factory in self._factories:
            key_bytes = '\x00'.join(factory.key)
            if factory.parents is None:
                parent_bytes = 'None:'
            else:
                parent_bytes = '\t'.join(
                    '\x00'.join(key) for key in factory.parents)
            records.append('%s\n%s\n%d\n%d\n' % (
                key_bytes, parent_bytes, factory._start, factory._end))
        header_bytes = ''.join(records)
        del records
        header_bytes_len = len(header_bytes)
        z_header_bytes = zlib.compress(header_bytes)
        del header_bytes
        block_bytes = self._block.to_bytes()
        lengths = '%d\n%d\n%d\n' % (len(z_header_bytes), header_bytes_len,
                                    len(block_bytes))
        return ''.join(['groupcompress-block\n', lengths,
                        z_header_bytes, block_bytes])

    @classmethod
    def from_bytes(cls, bytes):
        """Rebuild a manager from the output of _wire_bytes().

        :raises ValueError: If the storage kind line, any of the recorded
            lengths, or the shape of the record header is wrong.
        """
        # TODO: This does extra string copying, probably better to do it a
        #       different way
        (storage_kind, z_header_len, header_len,
         block_len, rest) = bytes.split('\n', 4)
        del bytes
        if storage_kind != 'groupcompress-block':
            raise ValueError('Unknown storage kind: %s' % (storage_kind,))
        z_header_len = int(z_header_len)
        if len(rest) < z_header_len:
            raise ValueError('Compressed header len shorter than all bytes')
        z_header = rest[:z_header_len]
        header_len = int(header_len)
        header = zlib.decompress(z_header)
        if len(header) != header_len:
            raise ValueError('invalid length for decompressed bytes')
        del z_header
        block_len = int(block_len)
        if len(rest) != z_header_len + block_len:
            raise ValueError('Invalid length for block')
        block_bytes = rest[z_header_len:]
        del rest
        # So now we have a valid GCB, we just need to parse the factories that
        # were sent to us
        header_lines = header.split('\n')
        del header
        if header_lines.pop() != '':
            raise ValueError('header lines did not end with a trailing'
                             ' newline')
        if len(header_lines) % 4 != 0:
            raise ValueError('The header was not an even multiple of 4 lines')
        block = GroupCompressBlock.from_bytes(block_bytes)
        del block_bytes
        result = cls(block)
        # Each record is four consecutive lines: key, parents, start, end.
        records = zip(header_lines[0::4], header_lines[1::4],
                      header_lines[2::4], header_lines[3::4])
        for key_line, parents_line, start_line, end_line in records:
            # intern()?
            key = tuple(key_line.split('\x00'))
            if parents_line == 'None:':
                parents = None
            else:
                parents = tuple([tuple(segment.split('\x00'))
                                 for segment in parents_line.split('\t')
                                  if segment])
            start_offset = int(start_line)
            end_offset = int(end_line)
            result.add_factory(key, parents, start_offset, end_offset)
        return result

734

735

736

def network_block_to_records(storage_kind, bytes, line_end):
    """Turn a serialised 'groupcompress-block' into a stream of records.

    :param storage_kind: Must be 'groupcompress-block'.
    :param bytes: The wire bytes, as parsed by
        _LazyGroupContentManager.from_bytes.
    :param line_end: Not used by this function.
    :return: The record stream of the manager built from bytes.
    :raises ValueError: If storage_kind is anything else.
    """
    if storage_kind == 'groupcompress-block':
        return _LazyGroupContentManager.from_bytes(bytes).get_record_stream()
    raise ValueError('Unknown storage kind: %s' % (storage_kind,))

741

742

743

class GroupCompressor(object):

744

"""Produce a serialised group of compressed texts.

745

746

It contains code very similar to SequenceMatcher because of having a similar

747

task. However some key differences apply:

748

- there is no junk, we want a minimal edit not a human readable diff.

749

- we don't filter very common lines (because we don't know where a good

750

range will start, and after the first text we want to be emitting minimal

751

edits only.

752

- we chain the left side, not the right side

753

- we incrementally update the adjacency matrix as new lines are provided.

754

- we look for matches in all of the left side, so the routine which does

755

the analogous task of find_longest_match does not need to filter on the

756

left side.

757

"""

758

759

def __init__(self):
    """Create a GroupCompressor."""
    # Consider seeding the lines with some sort of GC Start flag, or
    # putting it as part of the output stream, rather than in the
    # compressed bytes.
    self.lines = []
    # Running totals: bytes emitted so far, bytes fed in, and keys added.
    self.endpoint = self.input_bytes = self.num_keys = 0
    # key -> ((start endpoint, start line), (end endpoint, end line)),
    # filled in by compress().
    self.labels_deltas = {}
    self._last = None
    self._delta_index = _groupcompress_pyx.DeltaIndex()
    self._block = GroupCompressBlock()

772

773

def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
    """Compress lines with label key.

    :param key: A key tuple. It is stored in the output
        for identification of the text during decompression. If the last
        element is 'None' it is replaced with the sha1 of the text -
        e.g. sha1:xxxxxxx. (Empty texts are recorded under the key exactly
        as given.)
    :param bytes: The bytes to be compressed
    :param expected_sha: If non-None, the sha the lines are believed to
        have. It is trusted as-is and used instead of computing the sha1.
    :param nostore_sha: If the computed sha1 sum matches, we will raise
        ExistingContent rather than adding the text.
    :param soft: Do a 'soft' compression. This means that we require larger
        ranges to match to be considered for a copy command.
        (Not referenced by this implementation.)
    :return: A tuple ``(sha1, start, end, type, length)``: the sha1 of the
        text, the offsets in the group output where the record starts and
        ends, 'fulltext' or 'delta', and the encoded record length.
        Empty texts return ``(_null_sha1, 0, 0, 'fulltext', 0)``.
    :raises errors.ExistingContent: If the text's sha1 equals nostore_sha.
    :raises AssertionError: If the delta index and the output have got out
        of step.
    :seealso VersionedFiles.add_lines:
    """
    if not bytes: # empty, like a dir entry, etc
        if nostore_sha == _null_sha1:
            raise errors.ExistingContent()
        self._block.add_entry(key, type='empty',
                              sha1=None, start=0,
                              length=0)
        return _null_sha1, 0, 0, 'fulltext', 0
    # we assume someone knew what they were doing when they passed it in
    if expected_sha is not None:
        sha1 = expected_sha
    else:
        sha1 = osutils.sha_string(bytes)
    if nostore_sha is not None and sha1 == nostore_sha:
        raise errors.ExistingContent()
    if key[-1] is None:
        key = key[:-1] + ('sha1:' + sha1,)
    input_len = len(bytes)
    # By having action/label/sha1/len, we can parse the group if the index
    # was ever destroyed, we have the key in 'label', we know the final
    # bytes are valid from sha1, and we know where to find the end of this
    # record because of 'len'. (the delta record itself will store the
    # total length for the expanded record)
    # 'len: %d\n' costs approximately 1% increase in total data
    # Having the labels at all costs us 9-10% increase, 38% increase for
    # inventory pages, and 5.8% increase for text pages
    # new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
    if self._delta_index._source_offset != self.endpoint:
        raise AssertionError('_source_offset != endpoint'
            ' somehow the DeltaIndex got out of sync with'
            ' the output lines')
    # Only accept a delta that is at most half the size of the text.
    max_delta_size = input_len // 2
    delta = self._delta_index.make_delta(bytes, max_delta_size)
    if delta is None:
        type = 'fulltext'
        enc_length = encode_base128_int(input_len)
        # One control character plus the encoded length.
        len_mini_header = 1 + len(enc_length)
        length = input_len + len_mini_header
        self._delta_index.add_source(bytes, len_mini_header)
        new_chunks = ['f', enc_length, bytes]
    else:
        type = 'delta'
        enc_length = encode_base128_int(len(delta))
        len_mini_header = 1 + len(enc_length)
        length = len(delta) + len_mini_header
        new_chunks = ['d', enc_length, delta]
        if _FAST:
            # Skip indexing the delta; just account for its bytes.
            self._delta_index._source_offset += length
        else:
            self._delta_index.add_delta_source(delta, len_mini_header)
    self._block.add_entry(key, type=type, sha1=sha1,
                          start=self.endpoint, length=length)
    start = self.endpoint
    delta_start = (self.endpoint, len(self.lines))
    self.num_keys += 1
    # NOTE(review): output_chunks is defined outside this view; the check
    # below relies on it advancing self.endpoint by the bytes written.
    self.output_chunks(new_chunks)
    self.input_bytes += input_len
    delta_end = (self.endpoint, len(self.lines))
    self.labels_deltas[key] = (delta_start, delta_end)
    if not self._delta_index._source_offset == self.endpoint:
        raise AssertionError('the delta index is out of sync'
            ' with the output lines %s != %s'
            % (self._delta_index._source_offset, self.endpoint))
    return sha1, start, self.endpoint, type, length

856

857

def extract(self, key):
    """Return the text stored for a key previously added to the compressor.

    :param key: The key to extract.
    :return: A (bytes, sha1) tuple: the reconstructed text and the sha1
        recorded for it in the block entry.
    :raises ValueError: If the stored marker byte, the recorded length or
        the sha1 of the rebuilt text disagree with the block entry.
    """
    start_mark, end_mark = self.labels_deltas[key]
    # Each mark is (endpoint, chunk_index); only the chunk index is needed.
    stored_bytes = ''.join(self.lines[start_mark[1]:end_mark[1]])
    # TODO: Fix this, we shouldn't really be peeking here
    entry = self._block._entries[key]
    marker = stored_bytes[0]
    if entry.type == 'fulltext':
        if marker != 'f':
            raise ValueError('Index claimed fulltext, but stored bytes'
                             ' indicate %s' % (stored_bytes[0],))
        fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
        claimed_len = fulltext_len + 1 + offset
        if claimed_len != len(stored_bytes):
            raise ValueError('Index claimed fulltext len, but stored bytes'
                             ' claim %s != %s'
                             % (len(stored_bytes), claimed_len))
        text = stored_bytes[offset + 1:]
    elif entry.type == 'delta':
        # XXX: This is inefficient at best
        source = ''.join(self.lines)
        if marker != 'd':
            raise ValueError('Entry type claims delta, bytes claim %s'
                             % (stored_bytes[0],))
        delta_len, offset = decode_base128_int(stored_bytes[1:10])
        claimed_len = delta_len + 1 + offset
        if claimed_len != len(stored_bytes):
            raise ValueError('Index claimed delta len, but stored bytes'
                             ' claim %s != %s'
                             % (len(stored_bytes), claimed_len))
        text = _groupcompress_pyx.apply_delta(source,
                                              stored_bytes[offset + 1:])
    else:
        raise ValueError('Unknown entry type: %s' % (entry.type,))
    measured_sha1 = sha_string(text)
    if entry.sha1 != measured_sha1:
        raise ValueError('Recorded sha1 != measured %s != %s'
                         % (entry.sha1, measured_sha1))
    return text, entry.sha1

900

901

def flush(self):
    """Finish this group, creating a formatted stream.

    Joins every chunk output so far, hands the result to the block and
    drops the chunk list, so no further output is possible afterwards.

    :return: The block holding the joined content.
    """
    block = self._block
    joined = ''.join(self.lines)
    # The chunk list is no longer needed once the content is joined.
    self.lines = None
    block.set_content(joined)
    return block

907

908

def output_chunks(self, new_chunks):
    """Append chunks to the output and advance the endpoint.

    The position before the append is remembered in self._last so that
    pop_last() can roll this call back.

    :param new_chunks: The chunks to output.
    """
    chunks = self.lines
    self._last = (len(chunks), self.endpoint)
    chunks.extend(new_chunks)
    self.endpoint += sum(len(chunk) for chunk in new_chunks)

918

919

def pop_last(self):
    """Revoke the most recent output_chunks() call.

    After this, the data structures will be rolled back, but you cannot do
    more compression (the delta index is discarded).
    """
    self._delta_index = None
    chunk_count, previous_endpoint = self._last
    del self.lines[chunk_count:]
    self.endpoint = previous_endpoint
    self._last = None

929

930

def ratio(self):
    """Return the overall compression ratio (input bytes / output bytes)."""
    consumed = float(self.input_bytes)
    produced = float(self.endpoint)
    return consumed / produced

933

934

935

def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests, it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long should keys be.
    :return: A callable taking a transport and returning a
        GroupCompressVersionedFiles writing to 'newpack' on that transport.
    """
    def factory(transport):
        # One reference list (the parents) when a graph is stored, else none.
        if graph:
            ref_length = 1
        else:
            ref_length = 0
        graph_index = BTreeBuilder(reference_lists=ref_length,
                                   key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda:True, parents=graph,
                              add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        versioned_files = GroupCompressVersionedFiles(index, access, delta)
        # Kept on the result so cleanup_pack_group() can finish and close them.
        versioned_files.stream = stream
        versioned_files.writer = writer
        return versioned_files
    return factory

964

965

966

def cleanup_pack_group(versioned_files):
    """Finish the container writer and close the stream of a pack group.

    :param versioned_files: An object carrying the ``writer`` and ``stream``
        attributes that make_pack_factory() attaches to its result.
    """
    try:
        versioned_files.writer.end()
    finally:
        # Always release the stream, even when ending the container fails.
        versioned_files.stream.close()

969

970

971

class GroupCompressVersionedFiles(VersionedFiles):

972

"""A group-compress based VersionedFiles implementation."""

973

974

def __init__(self, index, access, delta=True):
    """Create a GroupCompressVersionedFiles object.

    :param index: The index object storing access and graph data.
    :param access: The access object storing raw data.
    :param delta: Whether to delta compress or just entropy compress.
    """
    # Collaborators supplied by the caller.
    self._index, self._access, self._delta = index, access, delta
    # Parents of keys that have been compressed but not yet indexed.
    self._unadded_refs = {}
    # Cache of decoded groups, capped at 50MB.
    group_cache_bytes = 50 * 1024 * 1024
    self._group_cache = LRUSizeCache(max_size=group_cache_bytes)
    self._fallback_vfs = []

987

988

def add_lines(self, key, parents, lines, parent_texts=None,
              left_matching_blocks=None, nostore_sha=None, random_id=False,
              check_content=True):
    """Add a text to the store.

    :param key: The key tuple of the text to add.
    :param parents: The parents key tuples of the text to add.
    :param lines: A list of lines. Each line must be a bytestring. All of
        them except the last must be terminated with a newline and contain
        no other newlines. The last line may either contain no newline or
        a single terminating newline. If the lines list does not meet this
        constraint the add routine may error or may succeed - but you will
        be unable to read the data back accurately. (Checking the lines
        have been split correctly is expensive and extremely unlikely to
        catch bugs so it is not done at runtime unless check_content is
        True.)
    :param parent_texts: An optional dictionary containing the opaque
        representations of some or all of the parents of version_id to
        allow delta optimisations. VERY IMPORTANT: the texts must be those
        returned by add_lines or data corruption can be caused.
    :param left_matching_blocks: a hint about which areas are common
        between the text and its left-hand-parent. The format is
        the SequenceMatcher.get_matching_blocks format.
    :param nostore_sha: Raise ExistingContent and do not add the lines to
        the versioned file if the digest of the lines matches this.
    :param random_id: If True a random id has been selected rather than
        an id determined by some deterministic process such as a converter
        from a foreign VCS. When True the backend may choose not to check
        for uniqueness of the resulting key within the versioned file, so
        this should only be done when the result is expected to be unique
        anyway.
    :param check_content: If True, the lines supplied are verified to be
        bytestrings that are correctly formed lines.
    :return: The text sha1, the number of bytes in the text, and an opaque
        representation of the inserted version which can be provided
        back to future add_lines calls in the parent_texts dictionary.
        This implementation always returns None for the opaque part.
    """
    self._index._check_write_ok()
    self._check_add(key, lines, random_id, check_content)
    # The caller might pass None if there is no graph data, but kndx
    # indexes can't directly store that, so we give them an empty tuple
    # instead.
    if parents is None:
        parents = ()
    # double handling for now. Make it work until then.
    text_length = sum(len(line) for line in lines)
    record = ChunkedContentFactory(key, parents, None, lines)
    sha1s = list(self._insert_record_stream([record], random_id=random_id,
                                            nostore_sha=nostore_sha))
    return sha1s[0], text_length, None

1037

1038

def add_fallback_versioned_files(self, a_versioned_files):
    """Register another source to consult for texts not present here.

    Fallbacks are consulted in the order they were added.

    :param a_versioned_files: A VersionedFiles object.
    """
    fallbacks = self._fallback_vfs
    fallbacks.append(a_versioned_files)

1044

1045

def annotate(self, key):
    """See VersionedFiles.annotate.

    :raises errors.RevisionNotPresent: If key has no entry in the parent
        map.
    """
    graph = Graph(self)
    parent_map = self.get_parent_map([key])
    if not parent_map:
        raise errors.RevisionNotPresent(key, self)
    if parent_map[key] is None:
        # No graph data for this key: annotate it on its own.
        keys = [key]
        parent_map = {key:()}
    else:
        # Collect every present ancestor of key.
        searcher = graph._make_breadth_first_searcher([key])
        keys = set()
        while True:
            try:
                present, _ghosts = searcher.next_with_ghosts()
            except StopIteration:
                break
            keys.update(present)
        parent_map = self.get_parent_map(keys)
    head_cache = _mod_graph.FrozenHeadsCache(graph)
    annotated = {}
    reannotate = annotate.reannotate
    # The annotation returned is the one built for the last record streamed.
    current_key = key
    for record in self.get_record_stream(keys, 'topological', True):
        current_key = record.key
        text_lines = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
        parent_annotations = [annotated[parent]
                              for parent in parent_map[current_key]]
        annotated[current_key] = list(
            reannotate(parent_annotations, text_lines, current_key, None,
                       head_cache))
    return annotated[current_key]

1074

1075

def check(self, progress_bar=None):
    """See VersionedFiles.check().

    Requests the fulltext of every key; progress_bar is accepted but not
    used.
    """
    stream = self.get_record_stream(self.keys(), 'unordered', True)
    for record in stream:
        # Only the extraction matters; the text itself is discarded.
        record.get_bytes_as('fulltext')

1080

1081

def _check_add(self, key, lines, random_id, check_content):
    """Check that the version id in key and the lines are safe to add.

    :raises errors.InvalidRevisionId: If the last element of key contains
        whitespace.
    """
    version_id = key[-1]
    if version_id is not None:
        if contains_whitespace(version_id):
            raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
    # TODO: If random_id==False and the key is already present, we should
    #       probably check that the existing content is identical to what is
    #       being inserted, and otherwise raise an exception.  This would make
    #       the bundle code simpler.
    if not check_content:
        return
    self._check_lines_not_unicode(lines)
    self._check_lines_are_lines(lines)

1095

1096

def get_parent_map(self, keys):
    """Get a map of the graph parents of keys.

    :param keys: The keys to look up parents for.
    :return: A mapping from keys to parents. Absent keys are absent from
        the mapping.
    """
    with_sources = self._get_parent_map_with_sources(keys)
    # Only the merged mapping is wanted, not the per-source breakdown.
    return with_sources[0]

1104

1105

def _get_parent_map_with_sources(self, keys):
    """Get a map of the parents of keys.

    :param keys: The keys to look up parents for.
    :return: A tuple. The first element is a mapping from keys to parents.
        Absent keys are absent from the mapping. The second element is a
        list with the locations each key was found in. The first element
        is the in-this-knit parents, the second the first fallback source,
        and so on. Sources are only consulted while keys remain unfound.
    """
    combined = {}
    per_source = []
    still_missing = set(keys)
    for source in [self._index] + self._fallback_vfs:
        if not still_missing:
            break
        found = source.get_parent_map(still_missing)
        per_source.append(found)
        combined.update(found)
        still_missing.difference_update(set(found))
    return combined, per_source

1127

1128

def _get_block(self, index_memo):
    """Return the group block that index_memo points into.

    Blocks are cached in self._group_cache, keyed on the first three
    elements of index_memo.

    :param index_memo: An index memo; only its first three elements are
        used here.
    """
    read_memo = index_memo[0:3]
    group_cache = self._group_cache
    try:
        return group_cache[read_memo]
    except KeyError:
        # Not cached yet: fall through and read the group.
        pass
    zdata = self._access.get_raw_records([read_memo]).next()
    # decompress - whole thing - this is not a bug, as it
    # permits caching. We might want to store the partially
    # decompressed group and decompress object, so that recent
    # texts are not penalised by big groups.
    block = GroupCompressBlock.from_bytes(zdata)
    group_cache[read_memo] = block
    return block

1148

1149

def get_missing_compression_parent_keys(self):
    """Return the keys of missing compression parents.

    Missing compression parents occur when a record stream was missing
    basis texts, or an index was scanned that had missing basis texts.

    :return: Always an empty frozenset for this implementation.
    """
    # GroupCompress cannot currently reference texts that are not in the
    # group, so this is valid for now
    nothing_missing = frozenset()
    return nothing_missing

1158

1159

def get_record_stream(self, keys, ordering, include_delta_closure):
    """Get a stream of records for keys.

    If the access layer raises RetryWithNewPacks part-way through, the
    access object is asked to reload and the stream resumes with the keys
    that have not been yielded yet.

    :param keys: The keys to include. May be any iterable, including a
        generator; it is consumed exactly once.
    :param ordering: Either 'unordered' or 'topological'. A topologically
        sorted stream has compression parents strictly before their
        children.
    :param include_delta_closure: If True then the closure across any
        compression parents will be included (in the opaque data).
    :return: An iterator of ContentFactory objects, each of which is only
        valid until the iterator is advanced.
    """
    import sys
    # keys might be a generator, so materialise it once and derive the set
    # from that list; building the set from the exhausted generator would
    # silently drop every key.
    orig_keys = list(keys)
    keys = set(orig_keys)
    if not keys:
        return
    if (not self._index.has_graph
        and ordering in ('topological', 'groupcompress')):
        # Cannot topological order when no graph has been stored.
        # but we allow 'as-requested' or 'unordered'
        ordering = 'unordered'

    remaining_keys = keys
    while True:
        try:
            keys = set(remaining_keys)
            for content_factory in self._get_remaining_record_stream(keys,
                    orig_keys, ordering, include_delta_closure):
                remaining_keys.discard(content_factory.key)
                yield content_factory
            return
        except errors.RetryWithNewPacks:
            # sys.exc_info() is used so that this block parses under both
            # the old and the new 'except' binding syntax.
            e = sys.exc_info()[1]
            self._access.reload_or_raise(e)

1193

1194

def _find_from_fallback(self, missing):
    """Find whatever keys you can from the fallbacks.

    :param missing: A set of missing keys. This set will be mutated as keys
        are found from a fallback_vfs
    :return: (parent_map, key_to_source_map, source_results)
        parent_map  the overall key => parent_keys
        key_to_source_map   a dict from {key: source}
        source_results      a list of (source: keys)
    """
    parent_map = {}
    key_to_source_map = {}
    source_results = []
    for fallback in self._fallback_vfs:
        if not missing:
            break
        found_parents = fallback.get_parent_map(missing)
        parent_map.update(found_parents)
        found_keys = list(found_parents)
        source_results.append((fallback, found_keys))
        for found_key in found_keys:
            key_to_source_map[found_key] = fallback
        missing.difference_update(found_keys)
    return parent_map, key_to_source_map, source_results

1217

1218

def _get_ordered_source_keys(self, ordering, parent_map, key_to_source_map):
    """Get the (source, [keys]) list.

    The returned objects should be in the order defined by 'ordering',
    which can weave between different sources.

    :param ordering: Must be one of 'topological' or 'groupcompress'
    :param parent_map: The key => parent_keys mapping to sort.
    :param key_to_source_map: Maps keys held by a fallback to that
        fallback; keys not listed are attributed to self.
    :return: List of [(source, [keys])] tuples, such that all keys are in
        the defined order, regardless of source.
    """
    if ordering == 'topological':
        sorter = topo_sort
    else:
        # ordering == 'groupcompress'
        # XXX: This only optimizes for the target ordering. We may need
        #      to balance that with the time it takes to extract
        #      ordering, by somehow grouping based on
        #      locations[key][0:3]
        sorter = sort_gc_optimal
    # Now group by source: consecutive keys from one source share an entry.
    grouped = []
    previous_source = None
    for key in sorter(parent_map):
        owner = key_to_source_map.get(key, self)
        if owner is not previous_source:
            grouped.append((owner, []))
            previous_source = owner
        grouped[-1][1].append(key)
    return grouped

1246

1247

def _get_as_requested_source_keys(self, orig_keys, locations, unadded_keys,
                                  key_to_source_map):
    """Group keys by source while keeping the caller's requested order.

    :param orig_keys: The keys in the order they were requested.
    :param locations: Container of keys held in this object's index.
    :param unadded_keys: Container of keys compressed but not yet indexed.
    :param key_to_source_map: Maps keys held by a fallback to that fallback.
    :return: A list of (source, [keys]) tuples; consecutive keys from the
        same source share one entry and absent keys are left out.
    """
    grouped = []
    previous_source = None
    for key in orig_keys:
        if key in locations or key in unadded_keys:
            owner = self
        elif key in key_to_source_map:
            owner = key_to_source_map[key]
        else:
            # absent
            continue
        if owner is not previous_source:
            grouped.append((owner, []))
            previous_source = owner
        grouped[-1][1].append(key)
    return grouped

1263

1264

def _get_io_ordered_source_keys(self, locations, unadded_keys,
                                source_result):
    """Group keys by source in an order that is cheap to read.

    :param locations: Dict of key => build details for indexed keys.
    :param unadded_keys: Keys compressed but not yet indexed.
    :param source_result: (source, keys) pairs already found in fallbacks.
    :return: A list of (source, [keys]) tuples: this object's keys first
        (unadded keys, then indexed keys sorted by the first element of
        their build details), followed by source_result.
    """
    # The first element of the build details identifies the group the bytes
    # are stored in, followed by the location in the group.
    indexed_keys = sorted(locations, key=lambda key: locations[key][0])
    # We don't have an ordering for keys in the in-memory object, but
    # lets process the in-memory ones first.
    own_keys = list(unadded_keys) + indexed_keys
    # Now grab all of the ones from other sources
    return [(self, own_keys)] + list(source_result)

1278

1279

def _get_remaining_record_stream(self, keys, orig_keys, ordering,
                                 include_delta_closure):
    """Get a stream of records for keys.

    Keys found nowhere are yielded first as AbsentContentFactory objects.
    The rest are yielded source by source: keys held here come either from
    the in-progress compressor (as fulltexts) or from stored groups (via a
    _LazyGroupContentManager per group); keys held by a fallback are
    streamed from that fallback.

    :param keys: The keys to include.
    :param orig_keys: The keys in the order the caller requested them;
        only consulted for 'as-requested' ordering.
    :param ordering: one of 'unordered', 'topological', 'groupcompress' or
        'as-requested'
    :param include_delta_closure: If True then the closure across any
        compression parents will be included (in the opaque data).
    :return: An iterator of ContentFactory objects, each of which is only
        valid until the iterator is advanced.
    """
    # Cheap: iterate
    locations = self._index.get_build_details(keys)
    unadded_keys = set(self._unadded_refs).intersection(keys)
    missing = keys.difference(locations)
    missing.difference_update(unadded_keys)
    # _find_from_fallback removes from 'missing' every key a fallback holds,
    # so afterwards 'missing' contains only keys found nowhere.
    (fallback_parent_map, key_to_source_map,
     source_result) = self._find_from_fallback(missing)
    if ordering in ('topological', 'groupcompress'):
        # would be better to not globally sort initially but instead
        # start with one key, recurse to its oldest parent, then grab
        # everything in the same group, etc.
        parent_map = dict((key, details[2]) for key, details in
                          locations.iteritems())
        for key in unadded_keys:
            parent_map[key] = self._unadded_refs[key]
        parent_map.update(fallback_parent_map)
        source_keys = self._get_ordered_source_keys(ordering, parent_map,
                                                    key_to_source_map)
    elif ordering == 'as-requested':
        source_keys = self._get_as_requested_source_keys(orig_keys,
            locations, unadded_keys, key_to_source_map)
    else:
        # We want to yield the keys in a semi-optimal (read-wise) ordering.
        # Otherwise we thrash the _group_cache and destroy performance
        source_keys = self._get_io_ordered_source_keys(locations,
            unadded_keys, source_result)
    for key in missing:
        yield AbsentContentFactory(key)
    # 'manager' batches consecutive keys that live in the same stored group;
    # 'last_read_memo' identifies the group it is currently collecting for.
    manager = None
    last_read_memo = None
    # TODO: This works fairly well at batching up existing groups into a
    #       streamable format, and possibly allowing for taking one big
    #       group and splitting it when it isn't fully utilized.
    #       However, it doesn't allow us to find under-utilized groups and
    #       combine them into a bigger group on the fly.
    #       (Consider the issue with how chk_map inserts texts
    #       one-at-a-time.) This could be done at insert_record_stream()
    #       time, but it probably would decrease the number of
    #       bytes-on-the-wire for fetch.
    for source, keys in source_keys:
        if source is self:
            for key in keys:
                if key in self._unadded_refs:
                    # Not yet written out: flush whatever has been batched
                    # so far (to keep the ordering), then pull the text
                    # straight from the in-progress compressor.
                    if manager is not None:
                        for factory in manager.get_record_stream():
                            yield factory
                        last_read_memo = manager = None
                    bytes, sha1 = self._compressor.extract(key)
                    parents = self._unadded_refs[key]
                    yield FulltextContentFactory(key, parents, sha1, bytes)
                else:
                    index_memo, _, parents, (method, _) = locations[key]
                    read_memo = index_memo[0:3]
                    if last_read_memo != read_memo:
                        # We are starting a new block. If we have a
                        # manager, we have found everything that fits for
                        # now, so yield records
                        if manager is not None:
                            for factory in manager.get_record_stream():
                                yield factory
                        # Now start a new manager
                        block = self._get_block(index_memo)
                        manager = _LazyGroupContentManager(block)
                        last_read_memo = read_memo
                    # Elements 3 and 4 of the memo are passed to the manager
                    # as this key's start and end.
                    start, end = index_memo[3:5]
                    manager.add_factory(key, parents, start, end)
        else:
            # Switching to a fallback source: emit anything batched first.
            if manager is not None:
                for factory in manager.get_record_stream():
                    yield factory
                last_read_memo = manager = None
            for record in source.get_record_stream(keys, ordering,
                                                   include_delta_closure):
                yield record
    # Emit whatever is still batched in the final manager.
    if manager is not None:
        for factory in manager.get_record_stream():
            yield factory

1368

1369

def get_sha1s(self, keys):
    """See VersionedFiles.get_sha1s().

    :param keys: The keys to look up.
    :return: A dict of key => sha1. Records that carry a sha1 use it
        as-is; present records without one have it computed from their
        fulltext; absent records are left out.
    """
    result = {}
    for record in self.get_record_stream(keys, 'unordered', True):
        if record.sha1 is not None:
            result[record.key] = record.sha1
        elif record.storage_kind != 'absent':
            result[record.key] = sha_string(record.get_bytes_as(
                'fulltext'))
    return result

1380

1381

def insert_record_stream(self, stream):
    """Insert a record stream into this container.

    :param stream: A stream of records to insert.
    :return: None
    :seealso VersionedFiles.get_record_stream:
    """
    inserter = self._insert_record_stream(stream, random_id=True)
    # The helper is a generator: it must be drained for the insertion to
    # happen, but the sha1s it yields are not needed here.
    for _unused_sha1 in inserter:
        continue

1390

1391

def _insert_record_stream(self, stream, random_id=False, nostore_sha=None,
                          reuse_blocks=True):
    """Internal core to insert a record stream into this container.

    This helper function has a different interface than insert_record_stream
    to allow add_lines to be minimal, but still return the needed data.

    :param stream: A stream of records to insert.
    :param random_id: Passed through to the index when records are added.
        When True, a key seen twice in the stream is noted and skipped.
    :param nostore_sha: If the sha1 of a given text matches nostore_sha,
        raise ExistingContent, rather than committing the new text.
    :param reuse_blocks: If the source is streaming from
        groupcompress-blocks, just insert the blocks as-is, rather than
        expanding the texts and inserting again.
    :return: An iterator over the sha1 of the inserted records. Records
        copied as whole blocks do not yield a sha1.
    :raises errors.RevisionNotPresent: If the stream contains an absent
        record.
    :raises AssertionError: If a block-backed record arrives without the
        block it belongs to having been inserted first.
    :seealso insert_record_stream:
    :seealso add_lines:
    """
    adapters = {}

    def get_adapter(adapter_key):
        # Build each adapter once and reuse it for the rest of the stream.
        try:
            return adapters[adapter_key]
        except KeyError:
            adapter_factory = adapter_registry.get(adapter_key)
            adapter = adapter_factory(self)
            adapters[adapter_key] = adapter
            return adapter

    # This will go up to fulltexts for gc to gc fetching, which isn't
    # ideal.
    self._compressor = GroupCompressor()
    self._unadded_refs = {}
    keys_to_add = []

    def flush():
        # Write the current group out, index every key that went into it,
        # then start over with a fresh compressor.
        group_bytes = self._compressor.flush().to_bytes()
        _, start, length = self._access.add_raw_records(
            [(None, len(group_bytes))], group_bytes)[0]
        nodes = [(key, "%d %d %s" % (start, length, reads), refs)
                 for key, reads, refs in keys_to_add]
        self._index.add_records(nodes, random_id=random_id)
        self._unadded_refs = {}
        del keys_to_add[:]
        self._compressor = GroupCompressor()

    last_prefix = None
    max_fulltext_len = 0
    max_fulltext_prefix = None
    insert_manager = None
    block_start = None
    block_length = None
    # XXX: TODO: remove this, it is just for safety checking for now
    inserted_keys = set()
    for record in stream:
        # Raise an error when a record is missing.
        if record.storage_kind == 'absent':
            raise errors.RevisionNotPresent(record.key, self)
        if random_id:
            if record.key in inserted_keys:
                trace.note('Insert claimed random_id=True, but then inserted'
                           ' %r two times', record.key)
                continue
            inserted_keys.add(record.key)
        if reuse_blocks:
            # If the reuse_blocks flag is set, check to see if we can just
            # copy a groupcompress block as-is.
            if record.storage_kind == 'groupcompress-block':
                # Insert the raw block into the target repo
                insert_manager = record._manager
                insert_manager._check_rebuild_block()
                block_bytes = record._manager._block.to_bytes()
                _, block_start, block_length = self._access.add_raw_records(
                    [(None, len(block_bytes))], block_bytes)[0]
                # Drop the reference promptly; the block may be large.
                del block_bytes
            if record.storage_kind in ('groupcompress-block',
                                       'groupcompress-block-ref'):
                # Explicit raises rather than assert statements, so the
                # checks still guard the index when running with -O.
                if insert_manager is None:
                    raise AssertionError('No insert_manager set')
                if record._manager is not insert_manager:
                    raise AssertionError('insert_manager does not match'
                        ' the current record, we cannot be positive'
                        ' that the appropriate content was inserted.')
                value = "%d %d %d %d" % (block_start, block_length,
                                         record._start, record._end)
                nodes = [(record.key, value, (record.parents,))]
                self._index.add_records(nodes, random_id=random_id)
                continue
        try:
            text = record.get_bytes_as('fulltext')
        except errors.UnavailableRepresentation:
            adapter_key = record.storage_kind, 'fulltext'
            adapter = get_adapter(adapter_key)
            text = adapter.get_bytes(record)
        if len(record.key) > 1:
            prefix = record.key[0]
            soft = (prefix == last_prefix)
        else:
            prefix = None
            soft = False
        if max_fulltext_len < len(text):
            max_fulltext_len = len(text)
            max_fulltext_prefix = prefix
        (found_sha1, start_point, end_point, _,
         _) = self._compressor.compress(record.key,
            text, record.sha1, soft=soft,
            nostore_sha=nostore_sha)
        # Check if we want to continue to include that text
        if (prefix == max_fulltext_prefix
            and end_point < 2 * max_fulltext_len):
            # As long as we are on the same file_id, we will fill at least
            # 2 * max_fulltext_len
            start_new_block = False
        elif end_point > 4*1024*1024:
            start_new_block = True
        elif (prefix is not None and prefix != last_prefix
              and end_point > 2*1024*1024):
            start_new_block = True
        else:
            start_new_block = False
        last_prefix = prefix
        if start_new_block:
            # Take the text back out of the full group, write that group,
            # and make the text the first entry of a new one.
            self._compressor.pop_last()
            flush()
            max_fulltext_len = len(text)
            (found_sha1, start_point, end_point, _,
             _) = self._compressor.compress(record.key,
                text, record.sha1)
        if record.key[-1] is None:
            key = record.key[:-1] + ('sha1:' + found_sha1,)
        else:
            key = record.key
        self._unadded_refs[key] = record.parents
        yield found_sha1
        keys_to_add.append((key, '%d %d' % (start_point, end_point),
                            (record.parents,)))
    if keys_to_add:
        flush()
    self._compressor = None

1554

1555

def iter_lines_added_or_present_in_keys(self, keys, pb=None):
    """Iterate over the lines in the versioned files from keys.

    This may return lines from other keys. Each item the returned
    iterator yields is a tuple of a line and a text version that that line
    is present in (not introduced in).

    Ordering of results is in whatever order is most suitable for the
    underlying storage format.

    If a progress bar is supplied, it may be used to indicate progress.
    The caller is responsible for cleaning up progress bars (because this
    is an iterator).

    NOTES:
     * Lines are normalised by the underlying store: they will all have
       newline terminators.
     * Lines are returned in arbitrary order.

    :param keys: The keys whose texts should be walked.
    :param pb: Optional progress bar; when None no progress is reported.
    :return: An iterator over (line, key).
    :raises errors.RevisionNotPresent: If one of the keys is absent.
    """
    keys = set(keys)
    total = len(keys)
    # we don't care about inclusions, the caller cares.
    # but we need to setup a list of records to visit.
    # we need key, position, length
    for key_idx, record in enumerate(self.get_record_stream(keys,
        'unordered', True)):
        # XXX: todo - optimise to use less than full texts.
        key = record.key
        # Progress is reported only when a bar was supplied, so no dummy
        # progress object has to be created for the pb=None case.
        if pb is not None:
            pb.update('Walking content', key_idx, total)
        if record.storage_kind == 'absent':
            raise errors.RevisionNotPresent(key, self)
        lines = split_lines(record.get_bytes_as('fulltext'))
        for line in lines:
            yield line, key
    if pb is not None:
        pb.update('Walking content', total, total)

1594

1595

def keys(self):
    """See VersionedFiles.keys.

    :return: A set of the keys in this object's index and in every
        fallback.
    """
    if 'evil' in debug.debug_flags:
        trace.mutter_callsite(2, "keys scales with size of history")
    all_keys = set()
    for source in [self._index] + self._fallback_vfs:
        all_keys.update(source.keys())
    return all_keys

1604

1605

1606

class _GCGraphIndex(object):

1607

"""Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

1608

1609

def __init__(self, graph_index, is_locked, parents=True,
             add_callback=None):
    """Construct a _GCGraphIndex on a graph_index.

    :param graph_index: An implementation of bzrlib.index.GraphIndex.
    :param is_locked: A callback, returns True if the index is locked and
        thus usable.
    :param parents: If True, record knits parents, if not do not record
        parents.
    :param add_callback: If not None, allow additions to the index and call
        this callback with a list of added GraphIndex nodes:
        [(node, value, node_refs), ...]
    """
    self._graph_index = graph_index
    self._is_locked = is_locked
    self._add_callback = add_callback
    # The same flag is exposed publicly as has_graph.
    self._parents = self.has_graph = parents

1627

1628

def add_records(self, records, random_id=False):

1629

"""Add multiple records to the index.

1630

1631

This function does not insert data into the Immutable GraphIndex

1632

backing the KnitGraphIndex, instead it prepares data for insertion by

1633

the caller and checks that it is safe to insert then calls

1634

self._add_callback with the prepared GraphIndex nodes.

1635

1636

:param records: a list of tuples:

1637

(key, options, access_memo, parents).

1638

:param random_id: If True the ids being added were randomly generated

1639

and no check for existence will be performed.

1640

"""

1641

if not self._add_callback:

1642

raise errors.ReadOnlyError(self)

1643

# we hope there are no repositories with inconsistent parentage

1644

# anymore.

1645

1646

changed = False

1647

keys = {}

1648

for (key, value, refs) in records:

1649

if not self._parents:

1650

if refs:

1651

for ref in refs:

1652

if ref:

1653

raise KnitCorrupt(self,

1654

"attempt to add node with parents "

1655

"in parentless index.")

1656

refs = ()

1657

changed = True

1658

keys[key] = (value, refs)

1659

# check for dups

1660

if not random_id:

1661

present_nodes = self._get_entries(keys)

1662

for (index, key, value, node_refs) in present_nodes:

1663

if node_refs != keys[key][1]:

1664

raise errors.KnitCorrupt(self, "inconsistent details in add_records"

1665

": %s %s" % ((value, node_refs), keys[key]))

1666

del keys[key]

1667

changed = True

1668

if changed:

1669

result = []

1670

if self._parents:

1671

for key, (value, node_refs) in keys.iteritems():

1672

result.append((key, value, node_refs))

1673

else:

1674

for key, (value, node_refs) in keys.iteritems():

1675

result.append((key, value))

1676

records = result

1677

self._add_callback(records)

1678

1679

def _check_read(self):

1680

"""Raise an exception if reads are not permitted."""

1681

if not self._is_locked():

1682

raise errors.ObjectNotLocked(self)

1683

1684

def _check_write_ok(self):

1685

"""Raise an exception if writes are not permitted."""

1686

if not self._is_locked():

1687

raise errors.ObjectNotLocked(self)

1688

1689

def _get_entries(self, keys, check_present=False):

1690

"""Get the entries for keys.

1691

1692

Note: Callers are responsible for checking that the index is locked

1693

before calling this method.

1694

1695

:param keys: An iterable of index key tuples.

1696

"""

1697

keys = set(keys)

1698

found_keys = set()

1699

if self._parents:

1700

for node in self._graph_index.iter_entries(keys):

1701

yield node

1702

found_keys.add(node[1])

1703

else:

1704

# adapt parentless index to the rest of the code.

1705

for node in self._graph_index.iter_entries(keys):

1706

yield node[0], node[1], node[2], ()

1707

found_keys.add(node[1])

1708

if check_present:

1709

missing_keys = keys.difference(found_keys)

1710

if missing_keys:

1711

raise RevisionNotPresent(missing_keys.pop(), self)

1712

1713

def get_parent_map(self, keys):

1714

"""Get a map of the parents of keys.

1715

1716

:param keys: The keys to look up parents for.

1717

:return: A mapping from keys to parents. Absent keys are absent from

1718

the mapping.

1719

"""

1720

self._check_read()

1721

nodes = self._get_entries(keys)

1722

result = {}

1723

if self._parents:

1724

for node in nodes:

1725

result[node[1]] = node[3][0]

1726

else:

1727

for node in nodes:

1728

result[node[1]] = None

1729

return result

1730

1731

def get_build_details(self, keys):

1732

"""Get the various build details for keys.

1733

1734

Ghosts are omitted from the result.

1735

1736

:param keys: An iterable of keys.

1737

:return: A dict of key:

1738

(index_memo, compression_parent, parents, record_details).

1739

index_memo

1740

opaque structure to pass to read_records to extract the raw

1741

data

1742

compression_parent

1743

Content that this record is built upon, may be None

1744

parents

1745

Logical parents of this node

1746

record_details

1747

extra information about the content which needs to be passed to

1748

Factory.parse_record

1749

"""

1750

self._check_read()

1751

result = {}

1752

entries = self._get_entries(keys)

1753

for entry in entries:

1754

key = entry[1]

1755

if not self._parents:

1756

parents = None

1757

else:

1758

parents = entry[3][0]

1759

method = 'group'

1760

result[key] = (self._node_to_position(entry),

1761

None, parents, (method, None))

1762

return result

1763

1764

def keys(self):

1765

"""Get all the keys in the collection.

1766

1767

The keys are not ordered.

1768

"""

1769

self._check_read()

1770

return [node[1] for node in self._graph_index.iter_all_entries()]

1771

1772

def _node_to_position(self, node):

1773

"""Convert an index value to position details."""

1774

bits = node[2].split(' ')

1775

# It would be nice not to read the entire gzip.

1776

start = int(bits[0])

1777

stop = int(bits[1])

1778

basis_end = int(bits[2])

1779

delta_end = int(bits[3])

1780

return node[0], start, stop, basis_end, delta_end

1781

1782

1783

# Try to import bzrlib's _groupcompress_pyx module. An ImportError is
# ignored, so code in this file cannot assume the name _groupcompress_pyx
# is bound after this point.
# NOTE(review): presumably a compiled extension that is optional - confirm.
try:
    from bzrlib import _groupcompress_pyx
except ImportError:
    pass