from zlib import Z_FINISH, Z_SYNC_FLUSH

# Module-level counters instrumenting ChunkWriter behaviour, indexed as:
# [max_repack, buffer_full, repacks_with_space, min_compression,
#  total_bytes_in, total_bytes_out, avg_comp,
#  bytes_autopack, bytes_sync_packed, num_full_by_zsync]
# min_compression starts at 999 so the first real ratio replaces it.
_stats = [0, 0, 0, 999, 0, 0, 0, 0, 0, 0]
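
# The helper below is not part of the original module; it is a minimal,
# illustrative sketch of how the parallel counters above line up with their
# labels. The name `dump_stats` and the `_stat_names` list are additions made
# here purely for clarity.
_stat_names = ['max_repack', 'buffer_full', 'repacks_with_space',
               'min_compression', 'total_bytes_in', 'total_bytes_out',
               'avg_comp', 'bytes_autopack', 'bytes_sync_packed',
               'num_full_by_zsync']

def dump_stats():
    """Print each counter in _stats next to its label."""
    for name, value in zip(_stat_names, _stats):
        print('%20s: %s' % (name, value))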


class ChunkWriter(object):
    """ChunkWriter allows writing of compressed data with a fixed size.

    :cvar _max_repack: To fit the maximum number of entries into a node, we
        will sometimes start over and compress the whole list to get tighter
        packing. We get diminishing returns after a while, so this limits the
        number of times we will try.
        The default is to try to avoid recompressing entirely, but setting this
        to something like 20 will give maximum compression.

    :cvar _max_zsync: Another tunable knob. If _max_repack is set to 0, then
        you can limit the number of times we will try to pack more data into a
        node. This allows us to do a single compression pass, rather than
        trying until we overflow, and then recompressing again.
    """
    # In testing, some values for bzr.dev::
    #     repack  time    MB    max   full
    #      2       8.4   4.2   1036      1
    #
    #     zsync   time    MB   repack  max_z   time w/ add_node
    #      0       6.7  24.7      0     6270   5.0
    #      1       6.5  13.2      0     3342   4.3
    #      2       6.6   9.6      0     2414   4.9
    #      5       6.5   6.2      0     1549   4.8
    #      6       6.5   5.8      1     1435   4.8
    #      7       6.6   5.5     19     1337   4.8
    #      8       6.7   5.3     81     1220   4.4
    #     10       6.8   5.0    260      967   5.3
    #     11       6.8   4.9    366      839   5.3
    #     12       6.9   4.8    454      731   5.1
    #     15       7.2   4.7    704      450   5.8
    #     20       7.7   4.6   1133        7   5.8

    # In testing, some values for mysql-unpacked::
    #     repack  time    MB   hit_max  full   time w/ add_node
    #      2      54.4  13.7    3467       0   35.4
    #     20      67.0  13.4       0    3380   46.7
    #
    #     zsync   time    MB   repack  max_z   time w/ add_node
    #      0      47.7 116.5      0    29782   29.5
    #      1      48.5  60.2      0    15356   27.8
    #      2      48.1  42.4      0    10822   27.8
    #      5      48.3  25.5      0     6491   26.8
    #      6      48.0  23.2     13     5896   27.3
    #      7      48.1  21.6     29     5451   27.5
    #      8      48.1  20.3     52     5108   27.1
    #     10      46.9  18.6    195     4526   29.4
    #     11      48.8  18.0    421     4143   29.2
    #     12      47.4  17.5    702     3738   28.0
    #     15      49.6  16.5   1223     2969   28.9
    #     20      48.9  15.7   2182     1810   29.6
    #     30             15.4   3891      23   31.4
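    #
    # A usage sketch for these knobs (an illustration added here, not original
    # code; it assumes the ChunkWriter(chunk_size) constructor and the
    # write()/finish() protocol used by callers in bzrlib, where write()
    # returns True when the bytes did not fit):
    #
    #   ChunkWriter._max_repack = 0   # single-pass mode...
    #   ChunkWriter._max_zsync = 8    # ...with at most 8 Z_SYNC_FLUSH probes
    #   writer = ChunkWriter(4096)    # e.g. one 4096-byte btree page
    #   for row in rows:
    #       if writer.write(row):
    #           break                 # row did not fit; this chunk is full
    #   compressed_parts = writer.finish()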

    def finish(self):
        # (excerpt) Do the final Z_FINISH flush, record the stats for this
        # chunk, and assert that we stayed within the chunk budget.
        out = self.compressor.flush(Z_FINISH)
        self.bytes_list.append(out)
        self.bytes_out_len += len(out)

        if self.num_repack > 0 and self.bytes_out_len > 0:
            comp = float(self.seen_bytes) / self.bytes_out_len
            if comp < _stats[3]:
                _stats[3] = comp # lowest (worst) compression factor seen
        _stats[4] += self.seen_bytes
        _stats[5] += self.bytes_out_len
        _stats[6] = float(_stats[4]) / _stats[5] # running average compression

        if self._max_repack == 0 and self.num_repack == 1:
            # (body of this branch is not included in this excerpt)
            pass
        if self.bytes_out_len > self.chunk_size:
            raise AssertionError('Somehow we ended up with too much'
                                 ' compressed data, %d > %d'
                                 % (self.bytes_out_len, self.chunk_size))
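
        # Standalone sketch (added for illustration, not original code) of the
        # accounting above: compress one blob, flush with Z_FINISH, and derive
        # the same seen-bytes/out-bytes factor that feeds _stats[3] and
        # _stats[6].
        #
        #   import zlib
        #   data = b'x' * 10000
        #   c = zlib.compressobj()
        #   out_len = len(c.compress(data)) + len(c.flush(zlib.Z_FINISH))
        #   comp = float(len(data)) / out_len   # compression factor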

    def write(self, bytes, reserved=False):
        # (excerpt from the body of write(); the capacity bookkeeping that
        # computes next_unflushed and remaining_capacity is elided)
        if next_unflushed < remaining_capacity:
            # Looks like it will fit: buffer it without a sync flush.
            self.bytes_in.append(bytes)
            self.seen_bytes += len(bytes)
            self.unflushed_in_bytes += len(bytes)
            _stats[7] += 1 # len(bytes)
        else:
            # This may or may not fit, try to add it with Z_SYNC_FLUSH
            _stats[8] += 1 # len(bytes)
            # Note: It is tempting to do this as a look-ahead pass, and to
            # 'copy()' the compressor before flushing. However, it seems that
            # 'flush()' is when the compressor actually does most work.
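
            # Standalone sketch (added for illustration) of the Z_SYNC_FLUSH
            # probe this branch performs, using plain zlib:
            #
            #   import zlib
            #   c = zlib.compressobj()
            #   sent = c.compress(b'some more bytes')
            #   sent += c.flush(zlib.Z_SYNC_FLUSH)
            #   # len(sent) is now a byte-accurate measure of the output so
            #   # far; unlike after Z_FINISH, the compressor can still accept
            #   # more data.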

            # (elided: a repack attempt that included the new bytes has set
            # bytes_out, this_len and compressor)
            # When we get *to* _max_repack, bump over so that the
            # earlier > _max_repack will be triggered.
            self.num_repack += 1
            _stats[0] += 1 # max_repack
            if this_len + 10 > capacity:
                # Even repacked, the data does not fit, so recompress without
                # the new bytes and hand them back to the caller.
                (bytes_out, this_len,
                 compressor) = self._recompress_all_bytes_in()
                _stats[1] += 1 # buffer_full
                self.compressor = compressor
                # Force us to not allow more data
                self.num_repack = self._max_repack + 1
                self.bytes_list = bytes_out
                self.bytes_out_len = this_len
                self.unused_bytes = bytes
                return True
            else:
                # This fits when we pack it tighter, so use the new packing.
                # (There is one Z_SYNC_FLUSH call in
                # _recompress_all_bytes_in.)
                _stats[2] += 1 # repacks_with_space
                self.compressor = compressor
                self.bytes_in.append(bytes)
                self.bytes_list = bytes_out
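
                # Standalone sketch (added for illustration) of what a
                # recompress-everything helper like _recompress_all_bytes_in
                # does, assuming it ends with Z_SYNC_FLUSH so that more data
                # can still be appended afterwards:
                #
                #   import zlib
                #   def recompress_all(chunks):
                #       c = zlib.compressobj()
                #       out = [c.compress(b) for b in chunks]
                #       out.append(c.flush(zlib.Z_SYNC_FLUSH))
                #       return out, sum(len(o) for o in out), c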