~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to tests/test_groupcompress.py

Another disk-format bump.

Move the labels/sha1 information into a pre-header. This also makes it
easier to decide to enable/disable the headers, as we can support
both with the same deserialising code (at least until we remove
the extra info from the indexes.)

This also makes a fulltext record stream start with 'f' and a delta
record stream start with 'd', which makes them more self describing.
The next step would probably be to write the base128 length of the
encoded bytes, which would make them fully independent, though
you wouldn't know what content they refer to.

This also brings in an update to .compress() which allows us to
see that we overflowed our group, roll back and start a new one.
This seems to give better compression in a 'more stable' manner.
Still open to tweaking, though.

Also introduce the 'gcc-chk255-big' format, which uses 64k leaf pages
rather than 4k leaf pages. Initial results show smaller compressed
size at a small (10%) increase in uncompressed size. Also shows
a full level decrease in the tree depth.

No-labels decreases the inv size by approximately 300kB, and big-page decreases
the inv size another 300kB, not to mention the 116k decrease in the
.cix index, just from not having the extra pages.

Having both no-labels and big inv pages brings a total drop of
11023k down to 9847k for the repo (1176kB savings, or 10% overall).

For now, leave the default with labels, but consider changing it.

Show diffs side-by-side

added added

removed removed

Lines of Context:
49
49
    return standard_tests
50
50
 
51
51
 
52
 
class TestGroupCompressor(TestCaseWithTransport):
 
52
class TestGroupCompressor(tests.TestCase):
53
53
    """Tests for GroupCompressor"""
54
54
 
55
55
    def test_empty_delta(self):
59
59
    def test_one_nosha_delta(self):
60
60
        # diff against NUKK
61
61
        compressor = groupcompress.GroupCompressor(True)
62
 
        sha1, end_point = compressor.compress(('label',),
 
62
        sha1, end_point, _, _ = compressor.compress(('label',),
63
63
            'strange\ncommon\n', None)
64
64
        self.assertEqual(sha_string('strange\ncommon\n'), sha1)
65
65
        expected_lines = [
66
 
            'fulltext\n',
67
 
            'label:label\nsha1:%s\n' % sha1,
68
 
            'len:15\n',
69
66
            'strange\ncommon\n',
70
67
            ]
71
68
        self.assertEqual(expected_lines, compressor.lines)
88
85
 
89
86
    def test_two_nosha_delta(self):
90
87
        compressor = groupcompress.GroupCompressor(True)
91
 
        sha1_1, _ = compressor.compress(('label',),
 
88
        sha1_1, _, _, _ = compressor.compress(('label',),
92
89
            'strange\ncommon long line\nthat needs a 16 byte match\n', None)
93
90
        expected_lines = list(compressor.lines)
94
 
        sha1_2, end_point = compressor.compress(('newlabel',),
 
91
        sha1_2, end_point, _, _ = compressor.compress(('newlabel',),
95
92
            'common long line\nthat needs a 16 byte match\ndifferent\n', None)
96
93
        self.assertEqual(sha_string('common long line\n'
97
94
                                    'that needs a 16 byte match\n'
98
95
                                    'different\n'), sha1_2)
99
96
        expected_lines.extend([
100
 
            'delta\n'
101
 
            'label:newlabel\n',
102
 
            'sha1:%s\n' % sha1_2,
103
 
            'len:16\n',
104
97
            # source and target length
105
 
            '\x7e\x36',
 
98
            '\x34\x36',
106
99
            # copy the line common
107
 
            '\x91\x52\x2c', #copy, offset 0x52, len 0x2c
 
100
            '\x91\x08\x2c', #copy, offset 0x08, len 0x2c
108
101
            # add the line different, and the trailing newline
109
102
            '\x0adifferent\n', # insert 10 bytes
110
103
            ])
115
108
        # The first interesting test: make a change that should use lines from
116
109
        # both parents.
117
110
        compressor = groupcompress.GroupCompressor(True)
118
 
        sha1_1, end_point = compressor.compress(('label',),
 
111
        sha1_1, end_point, _, _ = compressor.compress(('label',),
119
112
            'strange\ncommon very very long line\nwith some extra text\n', None)
120
 
        sha1_2, _ = compressor.compress(('newlabel',),
 
113
        sha1_2, _, _, _ = compressor.compress(('newlabel',),
121
114
            'different\nmoredifferent\nand then some more\n', None)
122
115
        expected_lines = list(compressor.lines)
123
 
        sha1_3, end_point = compressor.compress(('label3',),
 
116
        sha1_3, end_point, _, _ = compressor.compress(('label3',),
124
117
            'new\ncommon very very long line\nwith some extra text\n'
125
118
            'different\nmoredifferent\nand then some more\n',
126
119
            None)
129
122
                       'different\nmoredifferent\nand then some more\n'),
130
123
            sha1_3)
131
124
        expected_lines.extend([
132
 
            'delta\n',
133
 
            'label:label3\n',
134
 
            'sha1:%s\n' % sha1_3,
135
 
            'len:13\n',
136
 
            '\xfa\x01\x5f' # source and target length
 
125
            '\x63\x5f' # source and target length
137
126
            # insert new
138
127
            '\x03new',
139
128
            # Copy of first parent 'common' range
140
 
            '\x91\x51\x31' # copy, offset 0x51, 0x31 bytes
 
129
            '\x91\x07\x31' # copy, offset 0x07, 0x31 bytes
141
130
            # Copy of second parent 'different' range
142
 
            '\x91\xcf\x2b' # copy, offset 0xcf, 0x2b bytes
 
131
            '\x91\x38\x2b' # copy, offset 0x38, 0x2b bytes
143
132
            ])
144
133
        self.assertEqualDiffEncoded(expected_lines, compressor.lines)
145
134
        self.assertEqual(sum(map(len, expected_lines)), end_point)
146
135
 
147
136
    def test_stats(self):
148
137
        compressor = groupcompress.GroupCompressor(True)
149
 
        compressor.compress(('label',), 'strange\ncommon\n', None)
 
138
        compressor.compress(('label',), 'strange\ncommon long line\n'
 
139
                                        'plus more text\n', None)
150
140
        compressor.compress(('newlabel',),
151
 
                            'common\ndifferent\nmoredifferent\n', None)
 
141
                            'common long line\nplus more text\n'
 
142
                            'different\nmoredifferent\n', None)
152
143
        compressor.compress(('label3',),
153
 
                            'new\ncommon\ndifferent\nmoredifferent\n', None)
154
 
        self.assertAlmostEqual(0.3, compressor.ratio(), 1)
 
144
                            'new\ncommon long line\nplus more text\n'
 
145
                            '\ndifferent\nmoredifferent\n', None)
 
146
        self.assertAlmostEqual(1.4, compressor.ratio(), 1)
155
147
 
156
148
    def test_extract_from_compressor(self):
157
149
        # Knit fetching will try to reconstruct texts locally which results in
158
150
        # reading something that is in the compressor stream already.
159
151
        compressor = groupcompress.GroupCompressor(True)
160
 
        sha_1,  _ = compressor.compress(('label',), 'strange\ncommon\n', None)
161
 
        sha_2, _ = compressor.compress(('newlabel',),
162
 
            'common\ndifferent\nmoredifferent\n', None)
 
152
        sha1_1, _, _, _ = compressor.compress(('label',),
 
153
            'strange\ncommon long line\nthat needs a 16 byte match\n', None)
 
154
        expected_lines = list(compressor.lines)
 
155
        sha1_2, end_point, _, _ = compressor.compress(('newlabel',),
 
156
            'common long line\nthat needs a 16 byte match\ndifferent\n', None)
163
157
        # get the first out
164
 
        self.assertEqual((['strange\ncommon\n'], sha_1),
 
158
        self.assertEqual(('strange\ncommon long line\n'
 
159
                          'that needs a 16 byte match\n', sha1_1),
165
160
            compressor.extract(('label',)))
166
161
        # and the second
167
 
        self.assertEqual((['common\ndifferent\nmoredifferent\n'],
168
 
            sha_2), compressor.extract(('newlabel',)))
 
162
        self.assertEqual(('common long line\nthat needs a 16 byte match\n'
 
163
                          'different\n', sha1_2),
 
164
                         compressor.extract(('newlabel',)))
 
165
 
 
166
 
 
167
class TestBase128Int(tests.TestCase):
 
168
 
 
169
    def assertEqualEncode(self, bytes, val):
 
170
        self.assertEqual(bytes, groupcompress.encode_base128_int(val))
 
171
 
 
172
    def assertEqualDecode(self, val, num_decode, bytes):
 
173
        self.assertEqual((val, num_decode),
 
174
                         groupcompress.decode_base128_int(bytes))
 
175
 
 
176
    def test_encode(self):
 
177
        self.assertEqualEncode('\x01', 1)
 
178
        self.assertEqualEncode('\x02', 2)
 
179
        self.assertEqualEncode('\x7f', 127)
 
180
        self.assertEqualEncode('\x80\x01', 128)
 
181
        self.assertEqualEncode('\xff\x01', 255)
 
182
        self.assertEqualEncode('\x80\x02', 256)
 
183
        self.assertEqualEncode('\xff\xff\xff\xff\x0f', 0xFFFFFFFF)
 
184
 
 
185
    def test_decode(self):
 
186
        self.assertEqualDecode(1, 1, '\x01')
 
187
        self.assertEqualDecode(2, 1, '\x02')
 
188
        self.assertEqualDecode(127, 1, '\x7f')
 
189
        self.assertEqualDecode(128, 2, '\x80\x01')
 
190
        self.assertEqualDecode(255, 2, '\xff\x01')
 
191
        self.assertEqualDecode(256, 2, '\x80\x02')
 
192
        self.assertEqualDecode(0xFFFFFFFF, 5, '\xff\xff\xff\xff\x0f')
 
193
 
 
194
    def test_decode_with_trailing_bytes(self):
 
195
        self.assertEqualDecode(1, 1, '\x01abcdef')
 
196
        self.assertEqualDecode(127, 1, '\x7f\x01')
 
197
        self.assertEqualDecode(128, 2, '\x80\x01abcdef')
 
198
        self.assertEqualDecode(255, 2, '\xff\x01\xff')
 
199
 
 
200
 
 
201
class TestGroupCompressBlock(tests.TestCase):
 
202
 
 
203
    def test_from_empty_bytes(self):
 
204
        self.assertRaises(errors.InvalidGroupCompressBlock,
 
205
                          groupcompress.GroupCompressBlock.from_bytes, '')
 
206
 
 
207
    def test_from_minimal_bytes(self):
 
208
        block = groupcompress.GroupCompressBlock.from_bytes('gcb1z\n0\n0\n')
 
209
        self.assertIsInstance(block, groupcompress.GroupCompressBlock)
 
210
        self.assertEqual({}, block._entries)
 
211
 
 
212
    def test_from_bytes(self):
 
213
        z_header_bytes = (
 
214
            'gcb1z\n' # group compress block v1 plain
 
215
            '76\n' # Length of zlib bytes
 
216
            '183\n' # Length of all meta-info
 
217
            + zlib.compress(
 
218
            'key:bing\n'
 
219
            'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
 
220
            'type:fulltext\n'
 
221
            'start:100\n'
 
222
            'length:100\n'
 
223
            '\n'
 
224
            'key:foo\x00bar\n'
 
225
            'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
 
226
            'type:fulltext\n'
 
227
            'start:0\n'
 
228
            'length:100\n'
 
229
            '\n'))
 
230
        block = groupcompress.GroupCompressBlock.from_bytes(
 
231
            z_header_bytes)
 
232
        self.assertIs(None, block._content)
 
233
        self.assertIsInstance(block, groupcompress.GroupCompressBlock)
 
234
        self.assertEqual([('bing',), ('foo', 'bar')], sorted(block._entries))
 
235
        bing = block._entries[('bing',)]
 
236
        self.assertEqual(('bing',), bing.key)
 
237
        self.assertEqual('fulltext', bing.type)
 
238
        self.assertEqual('abcd'*10, bing.sha1)
 
239
        self.assertEqual(100, bing.start)
 
240
        self.assertEqual(100, bing.length)
 
241
        foobar = block._entries[('foo', 'bar')]
 
242
        self.assertEqual(('foo', 'bar'), foobar.key)
 
243
        self.assertEqual('fulltext', foobar.type)
 
244
        self.assertEqual('abcd'*10, foobar.sha1)
 
245
        self.assertEqual(0, foobar.start)
 
246
        self.assertEqual(100, foobar.length)
 
247
 
 
248
    def test_add_entry(self):
 
249
        gcb = groupcompress.GroupCompressBlock()
 
250
        e = gcb.add_entry(('foo', 'bar'), 'fulltext', 'abcd'*10, 0, 100)
 
251
        self.assertIsInstance(e, groupcompress.GroupCompressBlockEntry)
 
252
        self.assertEqual(('foo', 'bar'), e.key)
 
253
        self.assertEqual('fulltext', e.type)
 
254
        self.assertEqual('abcd'*10, e.sha1)
 
255
        self.assertEqual(0, e.start)
 
256
        self.assertEqual(100, e.length)
 
257
 
 
258
    def test_to_bytes(self):
 
259
        gcb = groupcompress.GroupCompressBlock()
 
260
        gcb.add_entry(('foo', 'bar'), 'fulltext', 'abcd'*10, 0, 100)
 
261
        gcb.add_entry(('bing',), 'fulltext', 'abcd'*10, 100, 100)
 
262
        bytes = gcb.to_bytes()
 
263
        self.assertStartsWith(bytes,
 
264
                              'gcb1z\n' # group compress block v1 zlib
 
265
                              '77\n' # Length of compressed bytes
 
266
                              '183\n' # Length of all meta-info
 
267
                             )
 
268
        remaining_bytes = bytes[13:]
 
269
        raw_bytes = zlib.decompress(remaining_bytes)
 
270
        self.assertEqualDiff('key:bing\n'
 
271
                             'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
 
272
                             'type:fulltext\n'
 
273
                             'start:100\n'
 
274
                             'length:100\n'
 
275
                             '\n'
 
276
                             'key:foo\x00bar\n'
 
277
                             'sha1:abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd\n'
 
278
                             'type:fulltext\n'
 
279
                             'start:0\n'
 
280
                             'length:100\n'
 
281
                             '\n', raw_bytes)