14
14
# You should have received a copy of the GNU General Public License
15
15
# along with this program; if not, write to the Free Software
16
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
18
"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""
22
22
# make GzipFile faster:
24
from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC
24
from gzip import FEXTRA, FCOMMENT, FNAME, FHCRC
29
29
# we want a \n preserved, break on \n only splitlines.
32
__all__ = ["GzipFile"]
30
from bzrlib import symbol_versioning
32
__all__ = ["GzipFile", "bytes_to_gzip"]
36
"""Return i as an unsigned integer, assuming it fits in 32 bits.
38
If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
46
"""Return the low-order 32 bits of an int, as a non-negative int."""
47
return i & 0xFFFFFFFFL
50
def bytes_to_gzip(bytes, factory=zlib.compressobj,
    level=zlib.Z_DEFAULT_COMPRESSION, method=zlib.DEFLATED,
    width=-zlib.MAX_WBITS, mem=zlib.DEF_MEM_LEVEL,
    crc32=zlib.crc32):
    """Create a gzip file containing bytes and return its content.

    :param bytes: A single string of bytes to compress.
    :param factory: Callable used to build the compression object; the
        level/method/width/mem parameters are its arguments.
    :param crc32: The CRC function used for the gzip trailer.
    :return: A string holding a complete gzip stream.
    """
    # Forward the compression settings so a caller-supplied factory, level
    # etc. actually take effect rather than being silently ignored.
    return chunks_to_gzip([bytes], factory=factory, level=level,
        method=method, width=width, mem=mem, crc32=crc32)
58
def chunks_to_gzip(chunks, factory=zlib.compressobj,
    level=zlib.Z_DEFAULT_COMPRESSION, method=zlib.DEFLATED,
    width=-zlib.MAX_WBITS, mem=zlib.DEF_MEM_LEVEL,
    crc32=zlib.crc32):
    """Create a gzip file containing chunks and return its content.

    :param chunks: An iterable of strings. Each string can have arbitrary
        layout.
    :return: A string holding a complete gzip stream containing the
        compressed concatenation of the chunks.
    """
    # The header below is the fixed 10-byte gzip header that GzipFile would
    # emit via multiple fileobj.write() calls; the original calls are kept
    # as comments beside each field. We never write a filename (flags == 0)
    # and we do not record the mtime (written as zero).
    result = [
        '\037\213'  # self.fileobj.write('\037\213')  # magic header
        '\010'      # self.fileobj.write('\010')      # compression method
                    # fname = self.filename[:-3]
                    # flags = 0
                    # if fname:
                    #     flags = FNAME
        '\x00'      # self.fileobj.write(chr(flags))
        '\0\0\0\0'  # write32u(self.fileobj, long(time.time()))
        '\002'      # self.fileobj.write('\002')
        '\377'      # self.fileobj.write('\377')
        ''          # self.fileobj.write(fname + '\000')
        ]
    # using a compressobj avoids a small header and trailer that the
    # compress() utility function adds.
    compress = factory(level, method, width, mem, 0)
    crc = 0
    total_len = 0
    for chunk in chunks:
        crc = crc32(chunk, crc)
        total_len += len(chunk)
        zbytes = compress.compress(chunk)
        if zbytes:
            result.append(zbytes)
    result.append(compress.flush())
    # size may exceed 2GB, or even 4GB
    result.append(struct.pack("<LL", LOWU32(crc), LOWU32(total_len)))
    return ''.join(result)
35
98
class GzipFile(gzip.GzipFile):
55
118
Yes, its only 1.6 seconds, but they add up.
121
def __init__(self, *args, **kwargs):
122
symbol_versioning.warn(
123
symbol_versioning.deprecated_in((2, 3, 0))
124
% 'bzrlib.tuned_gzip.GzipFile',
125
DeprecationWarning, stacklevel=2)
126
gzip.GzipFile.__init__(self, *args, **kwargs)
58
128
def _add_read_data(self, data):
59
129
# 4169 calls in 183
60
130
# temp var for len(data) and switch to +='s.
69
139
"""A tuned version of gzip._write_gzip_header
71
141
We have some extra constrains that plain Gzip does not.
72
1) We want to write the whole blob at once. rather than multiple
142
1) We want to write the whole blob at once. rather than multiple
73
143
calls to fileobj.write().
74
144
2) We never have a filename
75
145
3) We don't care about the time
121
191
self._add_read_data(self.decompress.flush())
122
assert len(self.decompress.unused_data) >= 8, "what does flush do?"
192
if len(self.decompress.unused_data) < 8:
193
raise AssertionError("what does flush do?")
123
194
self._gzip_tail = self.decompress.unused_data[0:8]
125
196
# tell the driving read() call we have stuffed all the data
145
216
self._gzip_tail = self.decompress.unused_data[0:8]
146
217
elif seek_length < 0:
147
218
# we haven't read enough to check the checksum.
148
assert -8 < seek_length, "too great a seek."
219
if not (-8 < seek_length):
220
raise AssertionError("too great a seek")
149
221
buf = self.fileobj.read(-seek_length)
150
222
self._gzip_tail = self.decompress.unused_data + buf
160
232
"""tuned to reduce function calls and eliminate file seeking:
162
234
reduces lsprof count from 800 to 288
164
236
avoid U32 call by using struct format L
167
# We've read to the end of the file, so we should have 8 bytes of
239
# We've read to the end of the file, so we should have 8 bytes of
168
240
# unused data in the decompressor. If we don't, there is a corrupt file.
169
241
# We use these 8 bytes to calculate the CRC and the recorded file size.
170
242
# We then check the that the computed CRC and size of the
171
243
# uncompressed data matches the stored values. Note that the size
172
244
# stored is the true file size mod 2**32.
173
assert len(self._gzip_tail) == 8, "gzip trailer is incorrect length."
245
if not (len(self._gzip_tail) == 8):
246
raise AssertionError("gzip trailer is incorrect length.")
174
247
crc32, isize = struct.unpack("<LL", self._gzip_tail)
175
248
# note that isize is unsigned - it can exceed 2GB
176
249
if crc32 != U32(self.crc):
295
368
def _unread(self, buf, len_buf=None):
296
369
"""tuned to remove unneeded len calls.
298
371
because this is such an inner routine in readline, and readline is
299
372
in many inner loops, this has been inlined into readline().
301
374
The len_buf parameter combined with the reduction in len calls dropped
302
the lsprof ms count for this routine on my test data from 800 to 200 -
375
the lsprof ms count for this routine on my test data from 800 to 200 -
305
378
if len_buf is None:
323
396
self.offset += data_len
325
398
def writelines(self, lines):
326
# profiling indicated a significant overhead
399
# profiling indicated a significant overhead
327
400
# calling write for each line.
328
401
# this batch call is a lot faster :).
329
402
# (4 seconds to 1 seconds for the sample upgrades I was testing).
330
403
self.write(''.join(lines))
405
if sys.version_info > (2, 7):
406
# As of Python 2.7 the crc32 must be positive when close is called
408
if self.fileobj is None:
410
if self.mode == gzip.WRITE:
411
self.crc &= 0xFFFFFFFFL
412
gzip.GzipFile.close(self)