14
14
# You should have received a copy of the GNU General Public License
15
15
# along with this program; if not, write to the Free Software
16
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18
18
"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""
22
22
# make GzipFile faster:
24
from gzip import FEXTRA, FCOMMENT, FNAME, FHCRC
24
from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC
29
29
# we want a \n preserved, break on \n only splitlines.
30
from bzrlib import symbol_versioning
32
__all__ = ["GzipFile", "bytes_to_gzip"]
36
"""Return i as an unsigned integer, assuming it fits in 32 bits.
38
If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
46
"""Return the low-order 32 bits of an int, as a non-negative int."""
47
return i & 0xFFFFFFFFL
50
def bytes_to_gzip(bytes, factory=zlib.compressobj,
51
level=zlib.Z_DEFAULT_COMPRESSION, method=zlib.DEFLATED,
52
width=-zlib.MAX_WBITS, mem=zlib.DEF_MEM_LEVEL,
54
"""Create a gzip file containing bytes and return its content."""
55
return chunks_to_gzip([bytes])
58
def chunks_to_gzip(chunks, factory=zlib.compressobj,
59
level=zlib.Z_DEFAULT_COMPRESSION, method=zlib.DEFLATED,
60
width=-zlib.MAX_WBITS, mem=zlib.DEF_MEM_LEVEL,
62
"""Create a gzip file containing chunks and return its content.
64
:param chunks: An iterable of strings. Each string can have arbitrary
68
'\037\213' # self.fileobj.write('\037\213') # magic header
69
'\010' # self.fileobj.write('\010') # compression method
70
# fname = self.filename[:-3]
74
'\x00' # self.fileobj.write(chr(flags))
75
'\0\0\0\0' # write32u(self.fileobj, long(time.time()))
76
'\002' # self.fileobj.write('\002')
77
'\377' # self.fileobj.write('\377')
79
'' # self.fileobj.write(fname + '\000')
81
# using a compressobj avoids a small header and trailer that the compress()
82
# utility function adds.
83
compress = factory(level, method, width, mem, 0)
87
crc = crc32(chunk, crc)
88
total_len += len(chunk)
89
zbytes = compress.compress(chunk)
92
result.append(compress.flush())
93
# size may exceed 2GB, or even 4GB
94
result.append(struct.pack("<LL", LOWU32(crc), LOWU32(total_len)))
95
return ''.join(result)
32
__all__ = ["GzipFile"]
98
35
class GzipFile(gzip.GzipFile):
118
55
Yes, it's only 1.6 seconds, but they add up.
121
def __init__(self, *args, **kwargs):
122
symbol_versioning.warn(
123
symbol_versioning.deprecated_in((2, 3, 0))
124
% 'bzrlib.tuned_gzip.GzipFile',
125
DeprecationWarning, stacklevel=2)
126
gzip.GzipFile.__init__(self, *args, **kwargs)
128
58
def _add_read_data(self, data):
129
59
# 4169 calls in 183
130
60
# temp var for len(data) and switch to +='s.
139
69
"""A tuned version of gzip._write_gzip_header
141
71
We have some extra constraints that plain Gzip does not.
142
1) We want to write the whole blob at once, rather than multiple
72
1) We want to write the whole blob at once, rather than multiple
143
73
calls to fileobj.write().
144
74
2) We never have a filename
145
75
3) We don't care about the time
191
121
self._add_read_data(self.decompress.flush())
192
if len(self.decompress.unused_data) < 8:
193
raise AssertionError("what does flush do?")
122
assert len(self.decompress.unused_data) >= 8, "what does flush do?"
194
123
self._gzip_tail = self.decompress.unused_data[0:8]
196
125
# tell the driving read() call we have stuffed all the data
216
145
self._gzip_tail = self.decompress.unused_data[0:8]
217
146
elif seek_length < 0:
218
147
# we haven't read enough to check the checksum.
219
if not (-8 < seek_length):
220
raise AssertionError("too great a seek")
148
assert -8 < seek_length, "too great a seek."
221
149
buf = self.fileobj.read(-seek_length)
222
150
self._gzip_tail = self.decompress.unused_data + buf
232
160
"""tuned to reduce function calls and eliminate file seeking:
234
162
reduces lsprof count from 800 to 288
236
164
avoid U32 call by using struct format L
239
# We've read to the end of the file, so we should have 8 bytes of
167
# We've read to the end of the file, so we should have 8 bytes of
240
168
# unused data in the decompressor. If we don't, there is a corrupt file.
241
169
# We use these 8 bytes to calculate the CRC and the recorded file size.
242
170
# We then check that the computed CRC and size of the
243
171
# uncompressed data matches the stored values. Note that the size
244
172
# stored is the true file size mod 2**32.
245
if not (len(self._gzip_tail) == 8):
246
raise AssertionError("gzip trailer is incorrect length.")
173
assert len(self._gzip_tail) == 8, "gzip trailer is incorrect length."
247
174
crc32, isize = struct.unpack("<LL", self._gzip_tail)
248
175
# note that isize is unsigned - it can exceed 2GB
249
176
if crc32 != U32(self.crc):
368
295
def _unread(self, buf, len_buf=None):
369
296
"""tuned to remove unneeded len calls.
371
298
because this is such an inner routine in readline, and readline is
372
299
in many inner loops, this has been inlined into readline().
374
301
The len_buf parameter combined with the reduction in len calls dropped
375
the lsprof ms count for this routine on my test data from 800 to 200 -
302
the lsprof ms count for this routine on my test data from 800 to 200 -
378
305
if len_buf is None:
396
323
self.offset += data_len
398
325
def writelines(self, lines):
399
# profiling indicated a significant overhead
326
# profiling indicated a significant overhead
400
327
# calling write for each line.
401
328
# this batch call is a lot faster :).
402
329
# (4 seconds to 1 seconds for the sample upgrades I was testing).
403
330
self.write(''.join(lines))
405
if sys.version_info > (2, 7):
406
# As of Python 2.7 the crc32 must be positive when close is called
408
if self.fileobj is None:
410
if self.mode == gzip.WRITE:
411
self.crc &= 0xFFFFFFFFL
412
gzip.GzipFile.close(self)