    FulltextContentFactory,
    )
from bzrlib.plugins.groupcompress import errors as gc_errors
def parse(bytes):
    if _NO_LABELS:
        action_byte = bytes[0]
        action = {'f':'fulltext', 'd':'delta'}[action_byte]
        return action, None, None, bytes[1:]
    (action, label_line, sha1_line, len_line,
     delta_bytes) = bytes.split('\n', 4)
    if (action not in ('fulltext', 'delta')
        or not label_line.startswith('label:')
        or not sha1_line.startswith('sha1:')
        or not len_line.startswith('len:')
        ):
        raise AssertionError("bad text record %r" % (bytes,))
    label = tuple(label_line[6:].split('\x00'))
    sha1 = sha1_line[5:]
    length = int(len_line[4:])
    if not len(delta_bytes) == length:
        raise AssertionError("bad length record %r" % (bytes,))
    return action, label, sha1, delta_bytes
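# Example (illustrative): a labelled delta record as consumed by parse();
# 'len:7' must match the 7 bytes of delta payload that follow the header:
#
#   parse('delta\nlabel:fid\x00rid\nsha1:%s\nlen:7\n<delta>' % ('0' * 40,))
#   => ('delta', ('fid', 'rid'), '0' * 40, '<delta>')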
def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding."""
    bytes = []
    while val >= 0x80:
        bytes.append(chr((val | 0x80) & 0xFF))
        val >>= 7
    bytes.append(chr(val))
    return ''.join(bytes)
def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding."""
    offset = 0
    val = 0
    shift = 0
    bval = ord(bytes[offset])
    while bval >= 0x80:
        val |= (bval & 0x7F) << shift
        shift += 7
        offset += 1
        bval = ord(bytes[offset])
    val |= bval << shift
    return val, offset + 1
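# Example (illustrative) round-trip for the helpers above. 300 is emitted
# low 7 bits first with the continuation bit set (0xAC), then 0x02:
#
#   >>> encode_base128_int(300)
#   '\xac\x02'
#   >>> decode_base128_int('\xac\x02')
#   (300, 2)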
def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into groupcompress order.

    groupcompress is defined (currently) as reverse-topological order, grouped
    by the key prefix.

    :return: A sorted-list of keys
    """
    return present_keys
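# Example (illustrative, assuming the reverse-topological ordering described
# in the docstring): with one prefix and a linear history a -> b -> c, the
# newest key sorts first, so it becomes the fulltext and the older texts
# delta against content already in the group:
#
#   sort_gc_optimal({('f', 'c'): (('f', 'b'),),
#                    ('f', 'b'): (('f', 'a'),),
#                    ('f', 'a'): ()})
#   => [('f', 'c'), ('f', 'b'), ('f', 'a')]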

class GroupCompressBlockEntry(object):
    """Track the information about a single object inside a GC group.

    This is generally just the dumb data structure.
    """

    def __init__(self, key, type, sha1, start, length):
        self.key = key
        self.type = type # delta, fulltext, external?
        self.sha1 = sha1 # Sha1 of content
        self.start = start # Byte offset to start of data
        self.length = length # Length of content

    def __repr__(self):
        return '%s(%s, %s, %s, %s, %s)' % (
            self.__class__.__name__,
            self.key, self.type, self.sha1, self.start, self.length
            )

class GroupCompressBlock(object):
    """An object which maintains the internal structure of the compressed data.

    This tracks the meta info (start of text, length, type, etc.)
    """

    # Group Compress Block v1 Zlib
    GCB_HEADER = 'gcb1z\n'

    def __init__(self):
        # map by key? or just order in file?
        self._entries = {}

    def _parse_header(self):
        """Parse the meta-info from the stream."""
    @classmethod
    def from_bytes(cls, bytes):
        out = cls()
        if bytes[:6] != cls.GCB_HEADER:
            raise gc_errors.InvalidGroupCompressBlock(
                'bytes did not start with %r' % (cls.GCB_HEADER,))
        pos = bytes.index('\n', 6)
        z_header_length = int(bytes[6:pos])
        pos += 1
        pos2 = bytes.index('\n', pos)
        header_length = int(bytes[pos:pos2])
        if z_header_length == 0:
            assert header_length == 0
            out._content = zlib.decompress(bytes[pos2+1:])
            out._size = len(out._content)
            return out
        pos = pos2 + 1
        pos2 = pos + z_header_length
        z_header_bytes = bytes[pos:pos2]
        assert len(z_header_bytes) == z_header_length
        d = zlib.decompressobj()
        header_bytes = d.decompress(z_header_bytes)
        assert len(header_bytes) == header_length
        del z_header_bytes
        lines = header_bytes.split('\n')
        header_len = len(header_bytes)
        del header_bytes
        info_dict = {}
        for line in lines:
            if not line: #End of record
                if not info_dict:
                    break
                out.add_entry(**info_dict)
                info_dict = {}
                continue
            key, value = line.split(':', 1)
            if key == 'key':
                value = tuple(map(intern, value.split('\x00')))
            elif key in ('start', 'length'):
                value = int(value)
            elif key == 'type':
                value = intern(value)
            info_dict[key] = value
        zcontent = bytes[pos2:]
        if zcontent:
            out._content = d.decompress(zcontent)
            assert d.flush() == ''
            out._size = header_len + len(out._content)
        return out
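    # Serialised layout (as read by from_bytes above and written by
    # to_bytes below):
    #
    #   'gcb1z\n'                  fixed header
    #   '%d\n' % z_header_length   compressed length of the entry header
    #   '%d\n' % header_length     uncompressed length of the entry header
    #   zlib(header)               'key:...\nsha1:...\ntype:...\nstart:...\n'
    #                              'length:...\n\n' per entry
    #   zlib(content)              the concatenated group texts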
    def extract(self, key, index_memo, sha1=None):
        """Extract the text for a specific key.

        :param key: The label used for this content
        :param sha1: TODO (should we validate only when sha1 is supplied?)
        :return: The bytes for the content
        """
        if _NO_LABELS or not self._entries:
            start, end = index_memo[3:5]
            c = self._content[start]
            if c == 'f':
                bytes = self._content[start+1:end]
                type = 'fulltext'
            else:
                assert c == 'd'
                type = 'delta'
                delta = self._content[start+1:end]
                bytes = _groupcompress_pyx.apply_delta(self._content, delta)
            entry = GroupCompressBlockEntry(key, type, sha_string(bytes),
                                            start, end - start)
            return entry, bytes
        entry = self._entries[key]
        if entry.type == 'fulltext':
            assert self._content[entry.start] == 'f'
            bytes = self._content[entry.start+1:entry.start + entry.length]
        elif entry.type == 'delta':
            assert self._content[entry.start] == 'd'
            delta = self._content[entry.start+1:entry.start + entry.length]
            bytes = _groupcompress_pyx.apply_delta(self._content, delta)
        return entry, bytes
    def add_entry(self, key, type, sha1, start, length):
        """Add new meta info about an entry.

        :param key: The key for the new content
        :param type: Whether this is a delta or fulltext entry (external?)
        :param sha1: sha1sum of the fulltext of this entry
        :param start: where the encoded bytes start
        :param length: total number of bytes in the encoded form
        """
        entry = GroupCompressBlockEntry(key, type, sha1, start, length)
        assert key not in self._entries
        self._entries[key] = entry
    def to_bytes(self, content=''):
        """Encode the information into a byte stream."""
        chunks = []
        for key in sorted(self._entries):
            entry = self._entries[key]
            chunk = ('key:%s\n'
                     'sha1:%s\n'
                     'type:%s\n'
                     'start:%s\n'
                     'length:%s\n'
                     '\n'
                     ) % ('\x00'.join(entry.key),
                          entry.sha1,
                          entry.type,
                          entry.start,
                          entry.length,
                          )
            chunks.append(chunk)
        bytes = ''.join(chunks)
        info_len = len(bytes)
        c = zlib.compressobj()
        z_bytes = []
        z_bytes.append(c.compress(bytes))
        del bytes
        # TODO: we may want to have the header compressed in the same chain
        #       as the data, or we may not, evaluate it
        #       having them compressed together is probably a win for
        #       revisions and the 'inv' portion of chk inventories. As the
        #       label in the header is duplicated in the text.
        #       For chk pages and real bytes, I would guess this is not
        #       true.
        z_bytes.append(c.flush(zlib.Z_SYNC_FLUSH))
        z_len = sum(map(len, z_bytes))
        c = zlib.compressobj()
        z_bytes.append(c.compress(content))
        z_bytes.append(c.flush())
        chunks = [self.GCB_HEADER,
                  '%d\n' % (z_len,),
                  '%d\n' % (info_len,),
                  ]
        chunks.extend(z_bytes)
        return ''.join(chunks)
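    # Example (illustrative) round-trip, assuming labels are enabled
    # (_NO_LABELS is False); the content is a 'f' marker plus the fulltext:
    #
    #   block = GroupCompressBlock()
    #   block.add_entry(('key',), type='fulltext',
    #                   sha1=sha_string('hello'), start=0, length=6)
    #   bytes = block.to_bytes('fhello')
    #   assert GroupCompressBlock.from_bytes(bytes)._entries.keys() \
    #       == [('key',)]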

class GroupCompressor(object):
    """Produce a serialised group of compressed texts."""
        max_delta_size = len(bytes) / 2
        delta = self._delta_index.make_delta(bytes, max_delta_size)
        if (delta is None):
            # We can't delta (perhaps source_text is empty)
            # so mark this as an insert
            type = 'fulltext'
            length = len(bytes) + 1
            self._delta_index.add_source(bytes, 1)
            new_chunks = ['f', bytes]
        else:
            type = 'delta'
            length = len(delta) + 1
            new_chunks = ['d', delta]
            if _FAST:
                self._delta_index._source_offset += length
            else:
                self._delta_index.add_delta_source(delta, 1)
        self._block.add_entry(key, type=type, sha1=sha1,
                              start=self.endpoint, length=length)
        delta_start = (self.endpoint, len(self.lines))
        self.output_chunks(new_chunks)
        self.input_bytes += input_len
        delta_end = (self.endpoint, len(self.lines))
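        # Storage sketch (illustrative): each text is stored in the group as
        # a one-byte kind marker plus payload: 'f' + fulltext when no usable
        # delta was found, otherwise 'd' + delta against the bytes already in
        # the group.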
        delta_details = self.labels_deltas[key]
        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
        stored_bytes = ''.join(delta_chunks)
        # TODO: Fix this, we shouldn't really be peeking here
        entry = self._block._entries[key]
        if entry.type == 'fulltext':
            bytes = stored_bytes[1:]
        else:
            assert entry.type == 'delta'
            # XXX: This is inefficient at best
            source = ''.join(self.lines)
            bytes = _groupcompress_pyx.apply_delta(source, stored_bytes)
        assert entry.sha1 == sha_string(bytes)
        return bytes, entry.sha1
    def output_chunks(self, new_chunks):
        """Output some chunks.

        :param new_chunks: The chunks to output.
        """
        self._last = (len(self.lines), self.endpoint)
        endpoint = self.endpoint
        self.lines.extend(new_chunks)
        endpoint += sum(map(len, new_chunks))
        self.endpoint = endpoint

    def pop_last(self):
        """Call this if you want to 'revoke' the last compression.

        After this, the data structures will be rolled back, but you cannot do
        more compression.
        """
        self._delta_index = None
        del self.lines[self._last[0]:]
        self.endpoint = self._last[1]

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)
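    # Example: 10MB of input text folded into a 1MB group gives
    # ratio() == 10.0.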
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                block = self._get_block(index_memo)
                entry, bytes = block.extract(key, index_memo)
                sha1 = entry.sha1
                if not _FAST and sha_string(bytes) != sha1:
                    raise AssertionError('sha1 sum did not match')
            yield FulltextContentFactory(key, parents, sha1, bytes)

    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record)
            if len(record.key) > 1:
                prefix = record.key[0]
                soft = (prefix == last_prefix)
            else:
                prefix = None
                soft = False
            if max_fulltext_len < len(bytes):
                max_fulltext_len = len(bytes)
                max_fulltext_prefix = prefix
            (found_sha1, end_point, type,
             length) = self._compressor.compress(record.key,
                                                 bytes, record.sha1, soft=soft)
            # delta_ratio = float(len(bytes)) / length
            # Check if we want to continue to include that text
            if (prefix == max_fulltext_prefix
                and end_point < 2 * max_fulltext_len):
                # As long as we are on the same file_id, we will fill at least
                # 2 * max_fulltext_len
                start_new_block = False
            elif end_point > 4*1024*1024:
                start_new_block = True
            elif (prefix is not None and prefix != last_prefix
                  and end_point > 2*1024*1024):
                start_new_block = True
            else:
                start_new_block = False
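            # Worked example (illustrative): with max_fulltext_len = 1.5MB
            # and an unchanged prefix, the group keeps filling until
            # end_point reaches 3MB (2 * max_fulltext_len); a prefix change
            # forces a new block once the group passes 2MB, and any group is
            # capped at 4MB.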
            # if type == 'fulltext':
            #     # If this is the first text, we don't do anything
            #     if self._compressor.num_keys > 1:
            #         if prefix is not None and prefix != last_prefix:
            #             # We just inserted a fulltext for a different prefix
            #             if end_point > 512 * 1024:
            #                 start_new_block = True
            #             # TODO: Consider packing several small texts together
            #             #       maybe only flush if end_point > some threshold
            #             #       if end_point > 512 * 1024 or len(bytes) <
            #             #           start_new_block = true
            #         else:
            #             # We just added a fulltext, part of the same file-id
            #             if (end_point > 2*1024*1024
            #                 and end_point > 5*max_fulltext_len):
            #                 start_new_block = True
            #     last_fulltext_len = len(bytes)
            # else:
            #     delta_ratio = float(len(bytes)) / length
            #     if delta_ratio < 3: # Not much compression
            #         if end_point > 1*1024*1024:
            #             start_new_block = True
            #     elif delta_ratio < 10: # 10:1 compression
            #         if end_point > 4*1024*1024:
            #             start_new_block = True
            if start_new_block:
                self._compressor.pop_last()
                max_fulltext_len = len(bytes)
                (found_sha1, end_point, type,
                 length) = self._compressor.compress(record.key,
                                                     bytes, record.sha1)
                assert type == 'fulltext'
                last_fulltext_len = length
            if record.key[-1] is None:
                key = record.key[:-1] + ('sha1:' + found_sha1,)
        return node[0], start, stop, basis_end, delta_end

def _get_longest_match(equivalence_table, pos, max_pos, locations):
    """Get the longest possible match for the current position."""
    range_start = pos
    range_len = 0
    copy_ends = None
    while pos < max_pos:
        if locations is None:
            locations = equivalence_table.get_idx_matches(pos)
        if locations is None:
            # No more matches, just return whatever we have, but we know that
            # this last position is not going to match anything
            pos += 1
            break
        else:
            if copy_ends is None:
                # We are starting a new range
                copy_ends = [loc + 1 for loc in locations]
                range_len = 1
                locations = None # Consumed
            else:
                # We are currently in the middle of a match
                next_locations = set(copy_ends).intersection(locations)
                if len(next_locations):
                    # The match continues for at least one more position
                    copy_ends = [loc + 1 for loc in next_locations]
                    range_len += 1
                    locations = None # Consumed
                else:
                    # But we are done with this match, we should be
                    # starting a new one, though. We will pass back 'locations'
                    # so that we don't have to do another lookup.
                    break
        pos += 1
    if copy_ends is None:
        return None, pos, locations
    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
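# Example (illustrative): if output position N matches source locations
# {5, 9}, copy_ends starts as [6, 10]; if position N+1 then matches {6, 12},
# the intersection {6} keeps the run alive and copy_ends becomes [7],
# describing a 2-line copy that started at source location 5.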

try:
    from bzrlib.plugins.groupcompress import _groupcompress_pyx
except ImportError: