# Copyright (C) 2008-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Tests for group compression."""

import zlib

from bzrlib import (
    btree_index,
    config,
    errors,
    groupcompress,
    index as _mod_index,
    osutils,
    tests,
    trace,
    versionedfile,
    )
from bzrlib.osutils import sha_string
from bzrlib.tests.test__groupcompress import compiled_groupcompress_feature
from bzrlib.tests.scenarios import load_tests_apply_scenarios


def group_compress_implementation_scenarios():
    scenarios = [
        ('python', {'compressor': groupcompress.PythonGroupCompressor}),
        ]
    if compiled_groupcompress_feature.available():
        scenarios.append(('C',
            {'compressor': groupcompress.PyrexGroupCompressor}))
    return scenarios


load_tests = load_tests_apply_scenarios
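# load_tests_apply_scenarios multiplies the tests in any class that defines a
# 'scenarios' attribute (TestAllGroupCompressors below), so those tests run
# once per available compressor implementation.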


class TestGroupCompressor(tests.TestCase):

    def _chunks_to_repr_lines(self, chunks):
        return '\n'.join(map(repr, ''.join(chunks).split('\n')))

    def assertEqualDiffEncoded(self, expected, actual):
        """Compare the actual content to the expected content.

        :param expected: A group of chunks that we expect to see
        :param actual: The measured 'chunks'

        We will transform the chunks back into lines, and then run 'repr()'
        over them to handle non-ascii characters.
        """
        self.assertEqualDiff(self._chunks_to_repr_lines(expected),
                             self._chunks_to_repr_lines(actual))


class TestAllGroupCompressors(TestGroupCompressor):
    """Tests for GroupCompressor"""

    scenarios = group_compress_implementation_scenarios()
    compressor = None # Set by scenario

    def test_empty_delta(self):
        compressor = self.compressor()
        self.assertEqual([], compressor.chunks)

    def test_one_nosha_delta(self):
        compressor = self.compressor()
        sha1, start_point, end_point, _ = compressor.compress(('label',),
            'strange\ncommon\n', None)
        self.assertEqual(sha_string('strange\ncommon\n'), sha1)
        expected_lines = 'f' '\x0f' 'strange\ncommon\n'
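        # ('f' marks a fulltext record; '\x0f' is the content length, 15,
        # which fits in a single length byte here.)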
        self.assertEqual(expected_lines, ''.join(compressor.chunks))
        self.assertEqual(0, start_point)
        self.assertEqual(sum(map(len, expected_lines)), end_point)

    def test_empty_content(self):
        compressor = self.compressor()
        # Adding empty bytes should return the 'null' record
        sha1, start_point, end_point, kind = compressor.compress(('empty',),
                                                                 '', None)
        self.assertEqual(0, start_point)
        self.assertEqual(0, end_point)
        self.assertEqual('fulltext', kind)
        self.assertEqual(groupcompress._null_sha1, sha1)
        self.assertEqual(0, compressor.endpoint)
        self.assertEqual([], compressor.chunks)
        # Even after adding some content
        compressor.compress(('content',), 'some\nbytes\n', None)
        self.assertTrue(compressor.endpoint > 0)
        sha1, start_point, end_point, kind = compressor.compress(('empty2',),
                                                                 '', None)
        self.assertEqual(0, start_point)
        self.assertEqual(0, end_point)
        self.assertEqual('fulltext', kind)
        self.assertEqual(groupcompress._null_sha1, sha1)

    def test_extract_from_compressor(self):
        # Knit fetching will try to reconstruct texts locally which results in
        # reading something that is in the compressor stream already.
        compressor = self.compressor()
        sha1_1, _, _, _ = compressor.compress(('label',),
            'strange\ncommon long line\nthat needs a 16 byte match\n', None)
        expected_lines = list(compressor.chunks)
        sha1_2, _, end_point, _ = compressor.compress(('newlabel',),
            'common long line\nthat needs a 16 byte match\ndifferent\n', None)
        self.assertEqual(('strange\ncommon long line\n'
                          'that needs a 16 byte match\n', sha1_1),
                         compressor.extract(('label',)))
        self.assertEqual(('common long line\nthat needs a 16 byte match\n'
                          'different\n', sha1_2),
                         compressor.extract(('newlabel',)))

    def test_pop_last(self):
        compressor = self.compressor()
        _, _, _, _ = compressor.compress(('key1',),
            'some text\nfor the first entry\n', None)
        expected_lines = list(compressor.chunks)
        _, _, _, _ = compressor.compress(('key2',),
            'some text\nfor the second entry\n', None)
        compressor.pop_last()
        self.assertEqual(expected_lines, compressor.chunks)


class TestPyrexGroupCompressor(TestGroupCompressor):

    _test_needs_features = [compiled_groupcompress_feature]
    compressor = groupcompress.PyrexGroupCompressor

    def test_stats(self):
        compressor = self.compressor()
        compressor.compress(('label',),
                            'strange\n'
                            'common very very long line\n'
                            'plus more text\n', None)
        compressor.compress(('newlabel',),
                            'common very very long line\n'
                            'plus more text\n'
                            'different\n'
                            'moredifferent\n', None)
        compressor.compress(('label3',),
                            'new\n'
                            'common very very long line\n'
                            'plus more text\n'
                            'different\n'
                            'moredifferent\n', None)
        self.assertAlmostEqual(1.9, compressor.ratio(), 1)
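        # (ratio() reports total input bytes over compressed output bytes;
        # the repeated 'common very very long line' text gets it to ~1.9.)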

    def test_two_nosha_delta(self):
        compressor = self.compressor()
        sha1_1, _, _, _ = compressor.compress(('label',),
            'strange\ncommon long line\nthat needs a 16 byte match\n', None)
        expected_lines = list(compressor.chunks)
        sha1_2, start_point, end_point, _ = compressor.compress(('newlabel',),
            'common long line\nthat needs a 16 byte match\ndifferent\n', None)
        self.assertEqual(sha_string('common long line\n'
                                    'that needs a 16 byte match\n'
                                    'different\n'), sha1_2)
        expected_lines.extend([
            # 'delta', delta length
            'd\x0f',
            # source and target length
            '\x36',
            # copy the line common
            '\x91\x0a\x2c', #copy, offset 0x0a, len 0x2c
            # add the line different, and the trailing newline
            '\x0adifferent\n', # insert 10 bytes
            ])
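        # (Generalizing the inline notes above: an opcode byte with the high
        # bit set is a copy, its low bits selecting how many offset/length
        # bytes follow (0x91 = one offset byte, one length byte); an opcode
        # in 1-127 inserts that many literal bytes.)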
        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
        self.assertEqual(sum(map(len, expected_lines)), end_point)

    def test_three_nosha_delta(self):
        # The first interesting test: make a change that should use lines from
        # both parents.
        compressor = self.compressor()
        sha1_1, _, _, _ = compressor.compress(('label',),
            'strange\ncommon very very long line\nwith some extra text\n', None)
        sha1_2, _, _, _ = compressor.compress(('newlabel',),
            'different\nmoredifferent\nand then some more\n', None)
        expected_lines = list(compressor.chunks)
        sha1_3, start_point, end_point, _ = compressor.compress(('label3',),
            'new\ncommon very very long line\nwith some extra text\n'
            'different\nmoredifferent\nand then some more\n',
            None)
        self.assertEqual(
            sha_string('new\ncommon very very long line\nwith some extra text\n'
                       'different\nmoredifferent\nand then some more\n'),
            sha1_3)
        expected_lines.extend([
            # 'delta', delta length
            'd\x0b',
            # source and target length
            '\x5f'
            # insert new
            '\x03new',
            # Copy of first parent 'common' range
            '\x91\x09\x31' # copy, offset 0x09, 0x31 bytes
            # Copy of second parent 'different' range
            '\x91\x3c\x2b' # copy, offset 0x3c, 0x2b bytes
            ])
        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
        self.assertEqual(sum(map(len, expected_lines)), end_point)


class TestPythonGroupCompressor(TestGroupCompressor):

    compressor = groupcompress.PythonGroupCompressor

    def test_stats(self):
        compressor = self.compressor()
        compressor.compress(('label',),
                            'strange\n'
                            'common very very long line\n'
                            'plus more text\n', None)
        compressor.compress(('newlabel',),
                            'common very very long line\n'
                            'plus more text\n'
                            'different\n'
                            'moredifferent\n', None)
        compressor.compress(('label3',),
                            'new\n'
                            'common very very long line\n'
                            'plus more text\n'
                            'different\n'
                            'moredifferent\n', None)
        self.assertAlmostEqual(1.9, compressor.ratio(), 1)

    def test_two_nosha_delta(self):
        compressor = self.compressor()
        sha1_1, _, _, _ = compressor.compress(('label',),
            'strange\ncommon long line\nthat needs a 16 byte match\n', None)
        expected_lines = list(compressor.chunks)
        sha1_2, start_point, end_point, _ = compressor.compress(('newlabel',),
            'common long line\nthat needs a 16 byte match\ndifferent\n', None)
        self.assertEqual(sha_string('common long line\n'
                                    'that needs a 16 byte match\n'
                                    'different\n'), sha1_2)
        expected_lines.extend([
            # 'delta', delta length
            'd\x0f',
            # target length
            '\x36',
            # copy the line common
            '\x91\x0a\x2c', #copy, offset 0x0a, len 0x2c
            # add the line different, and the trailing newline
            '\x0adifferent\n', # insert 10 bytes
            ])
        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
        self.assertEqual(sum(map(len, expected_lines)), end_point)

    def test_three_nosha_delta(self):
        # The first interesting test: make a change that should use lines from
        # both parents.
        compressor = self.compressor()
        sha1_1, _, _, _ = compressor.compress(('label',),
            'strange\ncommon very very long line\nwith some extra text\n', None)
        sha1_2, _, _, _ = compressor.compress(('newlabel',),
            'different\nmoredifferent\nand then some more\n', None)
        expected_lines = list(compressor.chunks)
        sha1_3, start_point, end_point, _ = compressor.compress(('label3',),
            'new\ncommon very very long line\nwith some extra text\n'
            'different\nmoredifferent\nand then some more\n',
            None)
        self.assertEqual(
            sha_string('new\ncommon very very long line\nwith some extra text\n'
                       'different\nmoredifferent\nand then some more\n'),
            sha1_3)
        expected_lines.extend([
            # 'delta', delta length
            'd\x0c',
            # target length
            '\x5f'
            # insert new
            '\x04new\n',
            # Copy of first parent 'common' range
            '\x91\x0a\x30' # copy, offset 0x0a, 0x30 bytes
            # Copy of second parent 'different' range
            '\x91\x3c\x2b' # copy, offset 0x3c, 0x2b bytes
            ])
        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
        self.assertEqual(sum(map(len, expected_lines)), end_point)


class TestGroupCompressBlock(tests.TestCase):

    def make_block(self, key_to_text):
        """Create a GroupCompressBlock, filling it with the given texts."""
        compressor = groupcompress.GroupCompressor()
        for key in sorted(key_to_text):
            compressor.compress(key, key_to_text[key], None)
        locs = dict((key, (start, end)) for key, (start, _, end, _)
                    in compressor.labels_deltas.iteritems())
        block = compressor.flush()
        raw_bytes = block.to_bytes()
        # Go through from_bytes(to_bytes()) so that we start with a compressed
        # content object
        return locs, groupcompress.GroupCompressBlock.from_bytes(raw_bytes)

    def test_from_empty_bytes(self):
        self.assertRaises(ValueError,
                          groupcompress.GroupCompressBlock.from_bytes, '')

    def test_from_minimal_bytes(self):
        block = groupcompress.GroupCompressBlock.from_bytes(
            'gcb1z\n0\n0\n')
        self.assertIsInstance(block, groupcompress.GroupCompressBlock)
        self.assertIs(None, block._content)
        self.assertEqual('', block._z_content)
        block._ensure_content()
        self.assertEqual('', block._content)
        self.assertEqual('', block._z_content)
        block._ensure_content() # Ensure content is safe to call 2x

    def test_from_invalid(self):
        self.assertRaises(ValueError,
                          groupcompress.GroupCompressBlock.from_bytes,
                          'this is not a valid header')

    def test_from_bytes(self):
        content = ('a tiny bit of content\n')
        z_content = zlib.compress(content)
        z_bytes = (
            'gcb1z\n' # group compress block v1 zlib
            '%d\n' # Length of compressed content
            '%d\n' # Length of uncompressed content
            '%s'   # Compressed content
            ) % (len(z_content), len(content), z_content)
        block = groupcompress.GroupCompressBlock.from_bytes(
            z_bytes)
        self.assertEqual(z_content, block._z_content)
        self.assertIs(None, block._content)
        self.assertEqual(len(z_content), block._z_content_length)
        self.assertEqual(len(content), block._content_length)
        block._ensure_content()
        self.assertEqual(z_content, block._z_content)
        self.assertEqual(content, block._content)

    def test_to_chunks(self):
        content_chunks = ['this is some content\n',
                          'this content will be compressed\n']
        content_len = sum(map(len, content_chunks))
        content = ''.join(content_chunks)
        gcb = groupcompress.GroupCompressBlock()
        gcb.set_chunked_content(content_chunks, content_len)
        total_len, block_chunks = gcb.to_chunks()
        block_bytes = ''.join(block_chunks)
        self.assertEqual(gcb._z_content_length, len(gcb._z_content))
        self.assertEqual(total_len, len(block_bytes))
        self.assertEqual(gcb._content_length, content_len)
        expected_header = ('gcb1z\n' # group compress block v1 zlib
                           '%d\n' # Length of compressed content
                           '%d\n' # Length of uncompressed content
                           ) % (gcb._z_content_length, gcb._content_length)
        # The first chunk should be the header chunk. It is small, fixed size,
        # and there is no compelling reason to split it up
        self.assertEqual(expected_header, block_chunks[0])
        self.assertStartsWith(block_bytes, expected_header)
        remaining_bytes = block_bytes[len(expected_header):]
        raw_bytes = zlib.decompress(remaining_bytes)
        self.assertEqual(content, raw_bytes)

    def test_to_bytes(self):
        content = ('this is some content\n'
                   'this content will be compressed\n')
        gcb = groupcompress.GroupCompressBlock()
        gcb.set_content(content)
        bytes = gcb.to_bytes()
        self.assertEqual(gcb._z_content_length, len(gcb._z_content))
        self.assertEqual(gcb._content_length, len(content))
        expected_header = ('gcb1z\n' # group compress block v1 zlib
                           '%d\n' # Length of compressed content
                           '%d\n' # Length of uncompressed content
                           ) % (gcb._z_content_length, gcb._content_length)
        self.assertStartsWith(bytes, expected_header)
        remaining_bytes = bytes[len(expected_header):]
        raw_bytes = zlib.decompress(remaining_bytes)
        self.assertEqual(content, raw_bytes)

        # we should get the same results if using the chunked version
        gcb = groupcompress.GroupCompressBlock()
        gcb.set_chunked_content(['this is some content\n'
                                 'this content will be compressed\n'],
                                len(content))
        old_bytes = bytes
        bytes = gcb.to_bytes()
        self.assertEqual(old_bytes, bytes)

    def test_partial_decomp(self):
        content_chunks = []
        # We need a sufficient amount of data so that zlib.decompress has
        # partial decompression to work with. Most auto-generated data
        # compresses a bit too well, we want a combination, so we combine a sha
        # hash with compressible data.
        for i in xrange(2048):
            next_content = '%d\nThis is a bit of duplicate text\n' % (i,)
            content_chunks.append(next_content)
            next_sha1 = osutils.sha_string(next_content)
            content_chunks.append(next_sha1 + '\n')
        content = ''.join(content_chunks)
        self.assertEqual(158634, len(content))
        z_content = zlib.compress(content)
        self.assertEqual(57182, len(z_content))
        block = groupcompress.GroupCompressBlock()
        block._z_content_chunks = (z_content,)
        block._z_content_length = len(z_content)
        block._compressor_name = 'zlib'
        block._content_length = 158634
        self.assertIs(None, block._content)
        block._ensure_content(100)
        self.assertIsNot(None, block._content)
        # We have decompressed at least 100 bytes
        self.assertTrue(len(block._content) >= 100)
        # We have not decompressed the whole content
        self.assertTrue(len(block._content) < 158634)
        self.assertEqualDiff(content[:len(block._content)], block._content)
        # ensuring content that we already have shouldn't cause any more data
        # to be decompressed
        cur_len = len(block._content)
        block._ensure_content(cur_len - 10)
        self.assertEqual(cur_len, len(block._content))
        # Now we want a bit more content
        cur_len += 10
        block._ensure_content(cur_len)
        self.assertTrue(len(block._content) >= cur_len)
        self.assertTrue(len(block._content) < 158634)
        self.assertEqualDiff(content[:len(block._content)], block._content)
        # And now lets finish
        block._ensure_content(158634)
        self.assertEqualDiff(content, block._content)
        # And the decompressor is finalized
        self.assertIs(None, block._z_content_decompressor)

    def test__ensure_all_content(self):
        content_chunks = []
        # We need a sufficient amount of data so that zlib.decompress has
        # partial decompression to work with. Most auto-generated data
        # compresses a bit too well, we want a combination, so we combine a sha
        # hash with compressible data.
        for i in xrange(2048):
            next_content = '%d\nThis is a bit of duplicate text\n' % (i,)
            content_chunks.append(next_content)
            next_sha1 = osutils.sha_string(next_content)
            content_chunks.append(next_sha1 + '\n')
        content = ''.join(content_chunks)
        self.assertEqual(158634, len(content))
        z_content = zlib.compress(content)
        self.assertEqual(57182, len(z_content))
        block = groupcompress.GroupCompressBlock()
        block._z_content_chunks = (z_content,)
        block._z_content_length = len(z_content)
        block._compressor_name = 'zlib'
        block._content_length = 158634
        self.assertIs(None, block._content)
        # The first _ensure_content got all of the required data
        block._ensure_content(158634)
        self.assertEqualDiff(content, block._content)
        # And we should have released the _z_content_decompressor since it was
        # fully consumed
        self.assertIs(None, block._z_content_decompressor)

    def test__dump(self):
        dup_content = 'some duplicate content\nwhich is sufficiently long\n'
        key_to_text = {('1',): dup_content + '1 unique\n',
                       ('2',): dup_content + '2 extra special\n'}
        locs, block = self.make_block(key_to_text)
        self.assertEqual([('f', len(key_to_text[('1',)])),
                          ('d', 21, len(key_to_text[('2',)]),
                           [('c', 2, len(dup_content)),
                            ('i', len('2 extra special\n'), '')
                           ]),
                         ], block._dump())
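        # (Reading the _dump() structure above: 'f' is a fulltext entry with
        # its length, 'd' a delta entry with delta and text lengths, 'c' a
        # copy instruction of (offset, length), and 'i' an insert of the
        # given number of bytes.)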


class TestCaseWithGroupCompressVersionedFiles(
        tests.TestCaseWithMemoryTransport):

    def make_test_vf(self, create_graph, keylength=1, do_cleanup=True,
                     dir='.', inconsistency_fatal=True):
        t = self.get_transport(dir)
        t.ensure_base()
        vf = groupcompress.make_pack_factory(graph=create_graph,
            delta=False, keylength=keylength,
            inconsistency_fatal=inconsistency_fatal)(t)
        if do_cleanup:
            self.addCleanup(groupcompress.cleanup_pack_group, vf)
        return vf


class TestGroupCompressVersionedFiles(TestCaseWithGroupCompressVersionedFiles):

    def make_g_index(self, name, ref_lists=0, nodes=[]):
        builder = btree_index.BTreeBuilder(ref_lists)
        for node, references, value in nodes:
            builder.add_node(node, references, value)
        stream = builder.finish()
        trans = self.get_transport()
        size = trans.put_file(name, stream)
        return btree_index.BTreeGraphIndex(trans, name, size)

    def make_g_index_missing_parent(self):
        graph_index = self.make_g_index('missing_parent', 1,
            [(('parent', ), '2 78 2 10', ([],)),
             (('tip', ), '2 78 2 10',
              ([('parent', ), ('missing-parent', )],)),
              ])
        return graph_index

    def test_get_record_stream_as_requested(self):
        # Consider promoting 'as-requested' to general availability, and
        # make this a VF interface test
        vf = self.make_test_vf(False, dir='source')
        vf.add_lines(('a',), (), ['lines\n'])
        vf.add_lines(('b',), (), ['lines\n'])
        vf.add_lines(('c',), (), ['lines\n'])
        vf.add_lines(('d',), (), ['lines\n'])
        vf.writer.end()
        keys = [record.key for record in vf.get_record_stream(
                    [('a',), ('b',), ('c',), ('d',)],
                    'as-requested', False)]
        self.assertEqual([('a',), ('b',), ('c',), ('d',)], keys)
        keys = [record.key for record in vf.get_record_stream(
                    [('b',), ('a',), ('d',), ('c',)],
                    'as-requested', False)]
        self.assertEqual([('b',), ('a',), ('d',), ('c',)], keys)

        # It should work even after being repacked into another VF
        vf2 = self.make_test_vf(False, dir='target')
        vf2.insert_record_stream(vf.get_record_stream(
            [('b',), ('a',), ('d',), ('c',)], 'as-requested', False))
        vf2.writer.end()

        keys = [record.key for record in vf2.get_record_stream(
                    [('a',), ('b',), ('c',), ('d',)],
                    'as-requested', False)]
        self.assertEqual([('a',), ('b',), ('c',), ('d',)], keys)
        keys = [record.key for record in vf2.get_record_stream(
                    [('b',), ('a',), ('d',), ('c',)],
                    'as-requested', False)]
        self.assertEqual([('b',), ('a',), ('d',), ('c',)], keys)

    def test_get_record_stream_max_bytes_to_index_default(self):
        vf = self.make_test_vf(True, dir='source')
        vf.add_lines(('a',), (), ['lines\n'])
        vf.writer.end()
        record = vf.get_record_stream([('a',)], 'unordered', True).next()
        self.assertEqual(vf._DEFAULT_COMPRESSOR_SETTINGS,
                         record._manager._get_compressor_settings())

    def test_get_record_stream_accesses_compressor_settings(self):
        vf = self.make_test_vf(True, dir='source')
        vf.add_lines(('a',), (), ['lines\n'])
        vf.writer.end()
        vf._max_bytes_to_index = 1234
        record = vf.get_record_stream([('a',)], 'unordered', True).next()
        self.assertEqual(dict(max_bytes_to_index=1234),
                         record._manager._get_compressor_settings())

    def test_insert_record_stream_reuses_blocks(self):
        vf = self.make_test_vf(True, dir='source')
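        # grouped_stream yields fulltext records chained parent-to-child, so
        # each insert_record_stream() call below produces one group; the
        # assertions then check that both groups' compressed blocks are
        # reused as-is when streaming back out.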
        def grouped_stream(revision_ids, first_parents=()):
            parents = first_parents
            for revision_id in revision_ids:
                key = (revision_id,)
                record = versionedfile.FulltextContentFactory(
                    key, parents, None,
                    'some content that is\n'
                    'identical except for\n'
                    'revision_id:%s\n' % (revision_id,))
                yield record
                parents = (key,)
        # One group, a-d
        vf.insert_record_stream(grouped_stream(['a', 'b', 'c', 'd']))
        # Second group, e-h
        vf.insert_record_stream(grouped_stream(['e', 'f', 'g', 'h'],
                                               first_parents=(('d',),)))
        block_bytes = {}
        stream = vf.get_record_stream([(r,) for r in 'abcdefgh'],
                                      'unordered', False)
        num_records = 0
        for record in stream:
            if record.key in [('a',), ('e',)]:
                self.assertEqual('groupcompress-block', record.storage_kind)
            else:
                self.assertEqual('groupcompress-block-ref',
                                 record.storage_kind)
            block_bytes[record.key] = record._manager._block._z_content
            num_records += 1
        self.assertEqual(8, num_records)
        for r in 'abcd':
            key = (r,)
            self.assertIs(block_bytes[key], block_bytes[('a',)])
            self.assertNotEqual(block_bytes[key], block_bytes[('e',)])
        for r in 'efgh':
            key = (r,)
            self.assertIs(block_bytes[key], block_bytes[('e',)])
            self.assertNotEqual(block_bytes[key], block_bytes[('a',)])
        # Now copy the blocks into another vf, and ensure that the blocks are
        # preserved without creating new entries
        vf2 = self.make_test_vf(True, dir='target')
        # ordering in 'groupcompress' order, should actually swap the groups in
        # the target vf, but the groups themselves should not be disturbed.
        def small_size_stream():
            for record in vf.get_record_stream([(r,) for r in 'abcdefgh'],
                                               'groupcompress', False):
                record._manager._full_enough_block_size = \
                    record._manager._block._content_length
                yield record

        vf2.insert_record_stream(small_size_stream())
        stream = vf2.get_record_stream([(r,) for r in 'abcdefgh'],
                                       'groupcompress', False)
        vf2.writer.end()
        num_records = 0
        for record in stream:
            num_records += 1
            self.assertEqual(block_bytes[record.key],
                             record._manager._block._z_content)
        self.assertEqual(8, num_records)

    def test_insert_record_stream_packs_on_the_fly(self):
        vf = self.make_test_vf(True, dir='source')
        def grouped_stream(revision_ids, first_parents=()):
            parents = first_parents
            for revision_id in revision_ids:
                key = (revision_id,)
                record = versionedfile.FulltextContentFactory(
                    key, parents, None,
                    'some content that is\n'
                    'identical except for\n'
                    'revision_id:%s\n' % (revision_id,))
                yield record
                parents = (key,)
        # One group, a-d
        vf.insert_record_stream(grouped_stream(['a', 'b', 'c', 'd']))
        # Second group, e-h
        vf.insert_record_stream(grouped_stream(['e', 'f', 'g', 'h'],
                                               first_parents=(('d',),)))
        # Now copy the blocks into another vf, and see that the
        # insert_record_stream rebuilt a new block on-the-fly because of
        # under-utilization
        vf2 = self.make_test_vf(True, dir='target')
        vf2.insert_record_stream(vf.get_record_stream(
            [(r,) for r in 'abcdefgh'], 'groupcompress', False))
        stream = vf2.get_record_stream([(r,) for r in 'abcdefgh'],
                                       'groupcompress', False)
        vf2.writer.end()
        num_records = 0
        # All of the records should be recombined into a single block
        block = None
        for record in stream:
            num_records += 1
            if block is None:
                block = record._manager._block
            else:
                self.assertIs(block, record._manager._block)
        self.assertEqual(8, num_records)

    def test__insert_record_stream_no_reuse_block(self):
        vf = self.make_test_vf(True, dir='source')
        def grouped_stream(revision_ids, first_parents=()):
            parents = first_parents
            for revision_id in revision_ids:
                key = (revision_id,)
                record = versionedfile.FulltextContentFactory(
                    key, parents, None,
                    'some content that is\n'
                    'identical except for\n'
                    'revision_id:%s\n' % (revision_id,))
                yield record
                parents = (key,)
        # One group, a-d
        vf.insert_record_stream(grouped_stream(['a', 'b', 'c', 'd']))
        # Second group, e-h
        vf.insert_record_stream(grouped_stream(['e', 'f', 'g', 'h'],
                                               first_parents=(('d',),)))
        vf.writer.end()
        self.assertEqual(8, len(list(vf.get_record_stream(
            [(r,) for r in 'abcdefgh'],
            'unordered', False))))
        # Now copy the blocks into another vf, and ensure that the blocks are
        # preserved without creating new entries
        vf2 = self.make_test_vf(True, dir='target')
        # ordering in 'groupcompress' order, should actually swap the groups in
        # the target vf, but the groups themselves should not be disturbed.
        list(vf2._insert_record_stream(vf.get_record_stream(
            [(r,) for r in 'abcdefgh'], 'groupcompress', False),
            reuse_blocks=False))
        vf2.writer.end()
        # After inserting with reuse_blocks=False, we should have everything in
        # a single new block.
        stream = vf2.get_record_stream([(r,) for r in 'abcdefgh'],
                                       'groupcompress', False)
        block = None
        for record in stream:
            if block is None:
                block = record._manager._block
            else:
                self.assertIs(block, record._manager._block)

    def test_add_missing_noncompression_parent_unvalidated_index(self):
        unvalidated = self.make_g_index_missing_parent()
        combined = _mod_index.CombinedGraphIndex([unvalidated])
        index = groupcompress._GCGraphIndex(combined,
            is_locked=lambda: True, parents=True,
            track_external_parent_refs=True)
        index.scan_unvalidated_index(unvalidated)
        self.assertEqual(
            frozenset([('missing-parent',)]), index.get_missing_parents())

    def test_track_external_parent_refs(self):
        g_index = self.make_g_index('empty', 1, [])
        mod_index = btree_index.BTreeBuilder(1, 1)
        combined = _mod_index.CombinedGraphIndex([g_index, mod_index])
        index = groupcompress._GCGraphIndex(combined,
            is_locked=lambda: True, parents=True,
            add_callback=mod_index.add_nodes,
            track_external_parent_refs=True)
        index.add_records([
            (('new-key',), '2 10 2 10', [(('parent-1',), ('parent-2',))])])
        self.assertEqual(
            frozenset([('parent-1',), ('parent-2',)]),
            index.get_missing_parents())

    def make_source_with_b(self, a_parent, path):
        source = self.make_test_vf(True, dir=path)
        source.add_lines(('a',), (), ['lines\n'])
        if a_parent:
            b_parents = (('a',),)
        else:
            b_parents = ()
        source.add_lines(('b',), b_parents, ['lines\n'])
        return source

    def do_inconsistent_inserts(self, inconsistency_fatal):
        target = self.make_test_vf(True, dir='target',
                                   inconsistency_fatal=inconsistency_fatal)
        for x in range(2):
            source = self.make_source_with_b(x==1, 'source%s' % x)
            target.insert_record_stream(source.get_record_stream(
                [('b',)], 'unordered', False))

    def test_inconsistent_redundant_inserts_warn(self):
        """Should not insert a record that is already present."""
        warnings = []
        def warning(template, args):
            warnings.append(template % args)
        _trace_warning = trace.warning
        trace.warning = warning
        try:
            self.do_inconsistent_inserts(inconsistency_fatal=False)
        finally:
            trace.warning = _trace_warning
        self.assertEqual(["inconsistent details in skipped record: ('b',)"
                          " ('42 32 0 8', ((),)) ('74 32 0 8', ((('a',),),))"],
                         warnings)

    def test_inconsistent_redundant_inserts_raises(self):
        e = self.assertRaises(errors.KnitCorrupt, self.do_inconsistent_inserts,
                              inconsistency_fatal=True)
        self.assertContainsRe(str(e), "Knit.* corrupt: inconsistent details"
                              " in add_records:"
                              " \('b',\) \('42 32 0 8', \(\(\),\)\) \('74 32"
                              " 0 8', \(\(\('a',\),\),\)\)")

    def test_clear_cache(self):
        vf = self.make_source_with_b(True, 'source')
        vf.writer.end()
        for record in vf.get_record_stream([('a',), ('b',)], 'unordered',
                                           True):
            pass
        self.assertTrue(len(vf._group_cache) > 0)
        vf.clear_cache()
        self.assertEqual(0, len(vf._group_cache))


class TestGroupCompressConfig(tests.TestCaseWithTransport):

    def make_test_vf(self):
        t = self.get_transport('.')
        t.ensure_base()
        factory = groupcompress.make_pack_factory(graph=True,
            delta=False, keylength=1, inconsistency_fatal=True)
        vf = factory(t)
        self.addCleanup(groupcompress.cleanup_pack_group, vf)
        return vf

    def test_max_bytes_to_index_default(self):
        vf = self.make_test_vf()
        gc = vf._make_group_compressor()
        self.assertEqual(vf._DEFAULT_MAX_BYTES_TO_INDEX,
                         vf._max_bytes_to_index)
        if isinstance(gc, groupcompress.PyrexGroupCompressor):
            self.assertEqual(vf._DEFAULT_MAX_BYTES_TO_INDEX,
                             gc._delta_index._max_bytes_to_index)

    def test_max_bytes_to_index_in_config(self):
        c = config.GlobalConfig()
        c.set_user_option('bzr.groupcompress.max_bytes_to_index', '10000')
        vf = self.make_test_vf()
        gc = vf._make_group_compressor()
        self.assertEqual(10000, vf._max_bytes_to_index)
        if isinstance(gc, groupcompress.PyrexGroupCompressor):
            self.assertEqual(10000, gc._delta_index._max_bytes_to_index)

    def test_max_bytes_to_index_bad_config(self):
        c = config.GlobalConfig()
        c.set_user_option('bzr.groupcompress.max_bytes_to_index', 'boogah')
        vf = self.make_test_vf()
        # TODO: This is triggering a warning, we might want to trap and make
        #       sure it is readable.
        gc = vf._make_group_compressor()
        self.assertEqual(vf._DEFAULT_MAX_BYTES_TO_INDEX,
                         vf._max_bytes_to_index)
        if isinstance(gc, groupcompress.PyrexGroupCompressor):
            self.assertEqual(vf._DEFAULT_MAX_BYTES_TO_INDEX,
                             gc._delta_index._max_bytes_to_index)
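

# A minimal stand-in for GroupCompressVersionedFiles as seen by
# _BatchingBlockFetcher: the fetcher only touches the _group_cache dict and
# _get_blocks(), so that is all this stub provides.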
class StubGCVF(object):
    def __init__(self, canned_get_blocks=None):
        self._group_cache = {}
        self._canned_get_blocks = canned_get_blocks or []
    def _get_blocks(self, read_memos):
        return iter(self._canned_get_blocks)


class Test_BatchingBlockFetcher(TestCaseWithGroupCompressVersionedFiles):
    """Simple whitebox unit tests for _BatchingBlockFetcher."""

    def test_add_key_new_read_memo(self):
        """Adding a key with an uncached read_memo new to this batch adds that
        read_memo to the list of memos to fetch.
        """
        # locations are: index_memo, ignored, parents, ignored
        # where index_memo is: (idx, offset, len, factory_start, factory_end)
        # and (idx, offset, size) is known as the 'read_memo', identifying the
        # raw bytes needed.
        read_memo = ('fake index', 100, 50)
        locations = {
            ('key',): (read_memo + (None, None), None, None, None)}
        batcher = groupcompress._BatchingBlockFetcher(StubGCVF(), locations)
        total_size = batcher.add_key(('key',))
        self.assertEqual(50, total_size)
        self.assertEqual([('key',)], batcher.keys)
        self.assertEqual([read_memo], batcher.memos_to_get)

    def test_add_key_duplicate_read_memo(self):
        """read_memos that occur multiple times in a batch will only be fetched
        once.
        """
        read_memo = ('fake index', 100, 50)
        # Two keys, both sharing the same read memo (but different overall
        # index_memos).
        locations = {
            ('key1',): (read_memo + (0, 1), None, None, None),
            ('key2',): (read_memo + (1, 2), None, None, None)}
        batcher = groupcompress._BatchingBlockFetcher(StubGCVF(), locations)
        total_size = batcher.add_key(('key1',))
        total_size = batcher.add_key(('key2',))
        self.assertEqual(50, total_size)
        self.assertEqual([('key1',), ('key2',)], batcher.keys)
        self.assertEqual([read_memo], batcher.memos_to_get)

    def test_add_key_cached_read_memo(self):
        """Adding a key with a cached read_memo will not cause that read_memo
        to be added to the list to fetch.
        """
        read_memo = ('fake index', 100, 50)
        gcvf = StubGCVF()
        gcvf._group_cache[read_memo] = 'fake block'
        locations = {
            ('key',): (read_memo + (None, None), None, None, None)}
        batcher = groupcompress._BatchingBlockFetcher(gcvf, locations)
        total_size = batcher.add_key(('key',))
        self.assertEqual(0, total_size)
        self.assertEqual([('key',)], batcher.keys)
        self.assertEqual([], batcher.memos_to_get)

    def test_yield_factories_empty(self):
        """An empty batch yields no factories."""
        batcher = groupcompress._BatchingBlockFetcher(StubGCVF(), {})
        self.assertEqual([], list(batcher.yield_factories()))

    def test_yield_factories_calls_get_blocks(self):
        """Uncached memos are retrieved via get_blocks."""
        read_memo1 = ('fake index', 100, 50)
        read_memo2 = ('fake index', 150, 40)
        gcvf = StubGCVF(
            canned_get_blocks=[
                (read_memo1, groupcompress.GroupCompressBlock()),
                (read_memo2, groupcompress.GroupCompressBlock())])
        locations = {
            ('key1',): (read_memo1 + (None, None), None, None, None),
            ('key2',): (read_memo2 + (None, None), None, None, None)}
        batcher = groupcompress._BatchingBlockFetcher(gcvf, locations)
        batcher.add_key(('key1',))
        batcher.add_key(('key2',))
        factories = list(batcher.yield_factories(full_flush=True))
        self.assertLength(2, factories)
        keys = [f.key for f in factories]
        kinds = [f.storage_kind for f in factories]
        self.assertEqual([('key1',), ('key2',)], keys)
        self.assertEqual(['groupcompress-block', 'groupcompress-block'], kinds)

    def test_yield_factories_flushing(self):
        """yield_factories holds back on yielding results from the final block
        unless passed full_flush=True.
        """
        fake_block = groupcompress.GroupCompressBlock()
        read_memo = ('fake index', 100, 50)
        gcvf = StubGCVF()
        gcvf._group_cache[read_memo] = fake_block
        locations = {
            ('key',): (read_memo + (None, None), None, None, None)}
        batcher = groupcompress._BatchingBlockFetcher(gcvf, locations)
        batcher.add_key(('key',))
        self.assertEqual([], list(batcher.yield_factories()))
        factories = list(batcher.yield_factories(full_flush=True))
        self.assertLength(1, factories)
        self.assertEqual(('key',), factories[0].key)
        self.assertEqual('groupcompress-block', factories[0].storage_kind)


class TestLazyGroupCompress(tests.TestCaseWithTransport):

    _texts = {
        ('key1',): "this is a text\n"
                   "with a reasonable amount of compressible bytes\n"
                   "which can be shared between various other texts\n",
        ('key2',): "another text\n"
                   "with a reasonable amount of compressible bytes\n"
                   "which can be shared between various other texts\n",
        ('key3',): "yet another text which won't be extracted\n"
                   "with a reasonable amount of compressible bytes\n"
                   "which can be shared between various other texts\n",
        ('key4',): "this will be extracted\n"
                   "but references most of its bytes from\n"
                   "yet another text which won't be extracted\n"
                   "with a reasonable amount of compressible bytes\n"
                   "which can be shared between various other texts\n",
    }

    def make_block(self, key_to_text):
        """Create a GroupCompressBlock, filling it with the given texts."""
        compressor = groupcompress.GroupCompressor()
        for key in sorted(key_to_text):
            compressor.compress(key, key_to_text[key], None)
        locs = dict((key, (start, end)) for key, (start, _, end, _)
                    in compressor.labels_deltas.iteritems())
        block = compressor.flush()
        raw_bytes = block.to_bytes()
        return locs, groupcompress.GroupCompressBlock.from_bytes(raw_bytes)

    def add_key_to_manager(self, key, locations, block, manager):
        start, end = locations[key]
        manager.add_factory(key, (), start, end)

    def make_block_and_full_manager(self, texts):
        locations, block = self.make_block(texts)
        manager = groupcompress._LazyGroupContentManager(block)
        for key in sorted(texts):
            self.add_key_to_manager(key, locations, block, manager)
        return block, manager

    def test_get_fulltexts(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        self.add_key_to_manager(('key1',), locations, block, manager)
        self.add_key_to_manager(('key2',), locations, block, manager)
        result_order = []
        for record in manager.get_record_stream():
            result_order.append(record.key)
            text = self._texts[record.key]
            self.assertEqual(text, record.get_bytes_as('fulltext'))
        self.assertEqual([('key1',), ('key2',)], result_order)

        # If we build the manager in the opposite order, we should get them
        # back in the opposite order
        manager = groupcompress._LazyGroupContentManager(block)
        self.add_key_to_manager(('key2',), locations, block, manager)
        self.add_key_to_manager(('key1',), locations, block, manager)
        result_order = []
        for record in manager.get_record_stream():
            result_order.append(record.key)
            text = self._texts[record.key]
            self.assertEqual(text, record.get_bytes_as('fulltext'))
        self.assertEqual([('key2',), ('key1',)], result_order)

    def test__wire_bytes_no_keys(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        wire_bytes = manager._wire_bytes()
        block_length = len(block.to_bytes())
        # We should have triggered a strip, since we aren't using any content
        stripped_block = manager._block.to_bytes()
        self.assertTrue(block_length > len(stripped_block))
        empty_z_header = zlib.compress('')
        self.assertEqual('groupcompress-block\n'
                         '8\n' # len(compress(''))
                         '0\n' # len('')
                         '%d\n'# compressed block len
                         '%s'  # zheader
                         '%s'  # block
                         % (len(stripped_block), empty_z_header,
                            stripped_block),
                         wire_bytes)
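
    # The wire format exercised above and below: a 'groupcompress-block\n'
    # kind marker, then the compressed-header length, header length, and
    # block length (one per line), followed by the zlib'ed header and the
    # raw block bytes.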
    def test__wire_bytes(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        self.add_key_to_manager(('key1',), locations, block, manager)
        self.add_key_to_manager(('key4',), locations, block, manager)
        block_bytes = block.to_bytes()
        wire_bytes = manager._wire_bytes()
        (storage_kind, z_header_len, header_len,
         block_len, rest) = wire_bytes.split('\n', 4)
        z_header_len = int(z_header_len)
        header_len = int(header_len)
        block_len = int(block_len)
        self.assertEqual('groupcompress-block', storage_kind)
        self.assertEqual(34, z_header_len)
        self.assertEqual(26, header_len)
        self.assertEqual(len(block_bytes), block_len)
        z_header = rest[:z_header_len]
        header = zlib.decompress(z_header)
        self.assertEqual(header_len, len(header))
        entry1 = locations[('key1',)]
        entry4 = locations[('key4',)]
        self.assertEqualDiff('key1\n'
                             '\n'  # no parents
                             '%d\n' # start offset
                             '%d\n' # end offset
                             'key4\n'
                             '\n'
                             '%d\n'
                             '%d\n'
                             % (entry1[0], entry1[1],
                                entry4[0], entry4[1]),
                             header)
        z_block = rest[z_header_len:]
        self.assertEqual(block_bytes, z_block)

    def test_from_bytes(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        self.add_key_to_manager(('key1',), locations, block, manager)
        self.add_key_to_manager(('key4',), locations, block, manager)
        wire_bytes = manager._wire_bytes()
        self.assertStartsWith(wire_bytes, 'groupcompress-block\n')
        manager = groupcompress._LazyGroupContentManager.from_bytes(wire_bytes)
        self.assertIsInstance(manager, groupcompress._LazyGroupContentManager)
        self.assertEqual(2, len(manager._factories))
        self.assertEqual(block._z_content, manager._block._z_content)
        result_order = []
        for record in manager.get_record_stream():
            result_order.append(record.key)
            text = self._texts[record.key]
            self.assertEqual(text, record.get_bytes_as('fulltext'))
        self.assertEqual([('key1',), ('key4',)], result_order)

    def test__check_rebuild_no_changes(self):
        block, manager = self.make_block_and_full_manager(self._texts)
        manager._check_rebuild_block()
        self.assertIs(block, manager._block)

    def test__check_rebuild_only_one(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        # Request just the first key, which should trigger a 'strip' action
        self.add_key_to_manager(('key1',), locations, block, manager)
        manager._check_rebuild_block()
        self.assertIsNot(block, manager._block)
        self.assertTrue(block._content_length > manager._block._content_length)
        # We should be able to still get the content out of this block, though
        # it should only have 1 entry
        for record in manager.get_record_stream():
            self.assertEqual(('key1',), record.key)
            self.assertEqual(self._texts[record.key],
                             record.get_bytes_as('fulltext'))

    def test__check_rebuild_middle(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        # Requesting a small key in the middle should trigger a 'rebuild'
        self.add_key_to_manager(('key4',), locations, block, manager)
        manager._check_rebuild_block()
        self.assertIsNot(block, manager._block)
        self.assertTrue(block._content_length > manager._block._content_length)
        for record in manager.get_record_stream():
            self.assertEqual(('key4',), record.key)
            self.assertEqual(self._texts[record.key],
                             record.get_bytes_as('fulltext'))

    def test_manager_default_compressor_settings(self):
        locations, old_block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(old_block)
        gcvf = groupcompress.GroupCompressVersionedFiles
        # It doesn't greedily evaluate _max_bytes_to_index
        self.assertIs(None, manager._compressor_settings)
        self.assertEqual(gcvf._DEFAULT_COMPRESSOR_SETTINGS,
                         manager._get_compressor_settings())

    def test_manager_custom_compressor_settings(self):
        locations, old_block = self.make_block(self._texts)
        called = []
        def compressor_settings():
            called.append('called')
            return (10,)
        manager = groupcompress._LazyGroupContentManager(old_block,
            get_compressor_settings=compressor_settings)
        gcvf = groupcompress.GroupCompressVersionedFiles
        # It doesn't greedily evaluate compressor_settings
        self.assertIs(None, manager._compressor_settings)
        self.assertEqual((10,), manager._get_compressor_settings())
        self.assertEqual((10,), manager._get_compressor_settings())
        self.assertEqual((10,), manager._compressor_settings)
        # Only called 1 time
        self.assertEqual(['called'], called)

    def test__rebuild_handles_compressor_settings(self):
        if not issubclass(groupcompress.GroupCompressor,
                          groupcompress.PyrexGroupCompressor):
            raise tests.TestNotApplicable('pure-python compressor'
                ' does not handle compressor_settings')
        locations, old_block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(old_block,
            get_compressor_settings=lambda: dict(max_bytes_to_index=32))
        gc = manager._make_group_compressor()
        self.assertEqual(32, gc._delta_index._max_bytes_to_index)
        self.add_key_to_manager(('key3',), locations, old_block, manager)
        self.add_key_to_manager(('key4',), locations, old_block, manager)
        action, last_byte, total_bytes = manager._check_rebuild_action()
        self.assertEqual('rebuild', action)
        manager._rebuild_block()
        new_block = manager._block
        self.assertIsNot(old_block, new_block)
        # Because of the new max_bytes_to_index, we do a poor job of
        # rebuilding. This is a side-effect of the change, but at least it does
        # show the setting had an effect.
        self.assertTrue(old_block._content_length < new_block._content_length)

    def test_check_is_well_utilized_all_keys(self):
        block, manager = self.make_block_and_full_manager(self._texts)
        self.assertFalse(manager.check_is_well_utilized())
        # Though we can fake it by changing the recommended minimum size
        manager._full_enough_block_size = block._content_length
        self.assertTrue(manager.check_is_well_utilized())
        # Setting it just above causes it to fail
        manager._full_enough_block_size = block._content_length + 1
        self.assertFalse(manager.check_is_well_utilized())
        # Setting the mixed-block size doesn't do anything, because the content
        # is considered to not be 'mixed'
        manager._full_enough_mixed_block_size = block._content_length
        self.assertFalse(manager.check_is_well_utilized())
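
    # Keys with two different prefixes ('f1' and 'f2', standing in for two
    # file-ids) count as mixed content, which is checked against the separate
    # _full_enough_mixed_block_size threshold instead.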
    def test_check_is_well_utilized_mixed_keys(self):
        texts = {}
        f1k1 = ('f1', 'k1')
        f1k2 = ('f1', 'k2')
        f2k1 = ('f2', 'k1')
        f2k2 = ('f2', 'k2')
        texts[f1k1] = self._texts[('key1',)]
        texts[f1k2] = self._texts[('key2',)]
        texts[f2k1] = self._texts[('key3',)]
        texts[f2k2] = self._texts[('key4',)]
        block, manager = self.make_block_and_full_manager(texts)
        self.assertFalse(manager.check_is_well_utilized())
        manager._full_enough_block_size = block._content_length
        self.assertTrue(manager.check_is_well_utilized())
        manager._full_enough_block_size = block._content_length + 1
        self.assertFalse(manager.check_is_well_utilized())
        manager._full_enough_mixed_block_size = block._content_length
        self.assertTrue(manager.check_is_well_utilized())

    def test_check_is_well_utilized_partial_use(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        manager._full_enough_block_size = block._content_length
        self.add_key_to_manager(('key1',), locations, block, manager)
        self.add_key_to_manager(('key2',), locations, block, manager)
        # Just using the content from key1 and 2 is not enough to be considered
        # 'well utilized'
        self.assertFalse(manager.check_is_well_utilized())
        # However if we add key4, then we have enough, as we only require 75%
        # utilization
        self.add_key_to_manager(('key4',), locations, block, manager)
        self.assertTrue(manager.check_is_well_utilized())


class Test_GCBuildDetails(tests.TestCase):

    def test_acts_like_tuple(self):
        # _GCBuildDetails inlines some of the data that used to be spread out
        # across a bunch of tuples
        bd = groupcompress._GCBuildDetails((('parent1',), ('parent2',)),
            ('INDEX', 10, 20, 0, 5))
        self.assertEqual(4, len(bd))
        self.assertEqual(('INDEX', 10, 20, 0, 5), bd[0])
        self.assertEqual(None, bd[1]) # Compression Parent is always None
        self.assertEqual((('parent1',), ('parent2',)), bd[2])
        self.assertEqual(('group', None), bd[3]) # Record details

    def test__repr__(self):
        bd = groupcompress._GCBuildDetails((('parent1',), ('parent2',)),
            ('INDEX', 10, 20, 0, 5))
        self.assertEqual("_GCBuildDetails(('INDEX', 10, 20, 0, 5),"
                         " (('parent1',), ('parent2',)))",
                         repr(bd))