# Copyright (C) 2005, 2006 Canonical Ltd
# Written by Robert Collins <robert.collins@canonical.com>
# This program is free software; you can redistribute it and/or modify

"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""

from cStringIO import StringIO

# make GzipFile faster:
import gzip
from gzip import FEXTRA, FCOMMENT, FNAME, FHCRC
import struct
import zlib

# we want a \n preserved, break on \n only splitlines.
import bzrlib

__all__ = ["GzipFile", "bytes_to_gzip"]

"""Return i as an unsigned integer, assuming it fits in 32 bits.
38
If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
46
"""Return the low-order 32 bits of an int, as a non-negative int."""
47
return i & 0xFFFFFFFFL
50
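

# Note (added, not from the original source): on Python 2, zlib.crc32 can
# return a negative int, so U32() maps such values back into the unsigned
# 32-bit range, e.g. U32(-1) == 0xFFFFFFFFL, while LOWU32(2**32 + 5) == 5.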


def bytes_to_gzip(bytes, factory=zlib.compressobj,
    level=zlib.Z_DEFAULT_COMPRESSION, method=zlib.DEFLATED,
    width=-zlib.MAX_WBITS, mem=zlib.DEF_MEM_LEVEL,
    crc32=zlib.crc32):
    """Create a gzip file containing bytes and return its content."""
    result = [
        '\037\213'  # self.fileobj.write('\037\213')  # magic header
        '\010'      # self.fileobj.write('\010')      # compression method
                    # fname = self.filename[:-3]
        '\x00'      # self.fileobj.write(chr(flags))
        '\0\0\0\0'  # write32u(self.fileobj, long(time.time()))
        '\002'      # self.fileobj.write('\002')
        '\377'      # self.fileobj.write('\377')
        ''          # self.fileobj.write(fname + '\000')
        ]
    # using a compressobj avoids a small header and trailer that the compress()
    # utility function adds.
    compress = factory(level, method, width, mem, 0)
    result.append(compress.compress(bytes))
    result.append(compress.flush())
    result.append(struct.pack("<L", LOWU32(crc32(bytes))))
    # size may exceed 2GB, or even 4GB
    result.append(struct.pack("<L", LOWU32(len(bytes))))
    return ''.join(result)
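

# Illustrative sketch, not part of the original module: the string returned by
# bytes_to_gzip() is a complete gzip member, so the stdlib gzip module can read
# it straight back.  The helper name below is hypothetical.
def _bytes_to_gzip_round_trip_example():
    data = 'hello world\n'
    blob = bytes_to_gzip(data)
    # Decompressing the blob recovers the original bytes.
    assert gzip.GzipFile(fileobj=StringIO(blob)).read() == data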


class GzipFile(gzip.GzipFile):

        self.extrasize += len_data
        self.size += len_data

    def _write_gzip_header(self):
        """A tuned version of gzip._write_gzip_header

        We have some extra constraints that plain Gzip does not have.
        1) We want to write the whole blob at once, rather than multiple
           calls to fileobj.write().
        2) We never have a filename
        3) We don't care about the time
        """
        self.fileobj.write(
            '\037\213'  # self.fileobj.write('\037\213')  # magic header
            '\010'      # self.fileobj.write('\010')      # compression method
                        # fname = self.filename[:-3]
            '\x00'      # self.fileobj.write(chr(flags))
            '\0\0\0\0'  # write32u(self.fileobj, long(time.time()))
            '\002'      # self.fileobj.write('\002')
            '\377'      # self.fileobj.write('\377')
            ''          # self.fileobj.write(fname + '\000')
            )
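        # Note (added, not from the original source): the adjacent string
        # literals above concatenate into one fixed 10-byte header,
        # '\x1f\x8b\x08\x00\x00\x00\x00\x00\x02\xff', so the whole gzip header
        # goes out in a single fileobj.write() call.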

    def _read(self, size=1024):
        # various optimisations:
        # reduces lsprof count from 2500 to

        self._add_read_data(self.decompress.flush())
        if len(self.decompress.unused_data) < 8:
            raise AssertionError("what does flush do?")
        self._gzip_tail = self.decompress.unused_data[0:8]
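        # Note (added, not from the original source): zlib stops at the end of
        # the deflate stream, so the 8-byte gzip trailer (CRC32 + ISIZE) that
        # follows it is left in unused_data; that is what _gzip_tail holds.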
        # tell the driving read() call we have stuffed all the data

        if seek_length > 0:
            self._gzip_tail = self.decompress.unused_data[0:8]
        elif seek_length < 0:
            # we haven't read enough to check the checksum.
            if not (-8 < seek_length):
                raise AssertionError("too great a seek")
            buf = self.fileobj.read(-seek_length)
            self._gzip_tail = self.decompress.unused_data + buf
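            # Note (added, not from the original source): after topping the
            # tail up from the file, _gzip_tail should hold the full 8 trailer
            # bytes; its length is verified before the CRC/size check below.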
        # We've read to the end of the file, so we should have 8 bytes of
        # unused data in the decompressor. If we don't, there is a corrupt file.
        # We use these 8 bytes to calculate the CRC and the recorded file size.
        # We then check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        if not (len(self._gzip_tail) == 8):
            raise AssertionError("gzip trailer is incorrect length.")
        crc32, isize = struct.unpack("<LL", self._gzip_tail)
        # note that isize is unsigned - it can exceed 2GB
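        # Note (added, not from the original source): e.g. for a 5GB stream the
        # trailer stores (5 * 1024**3) % 2**32, so comparisons against isize
        # are only meaningful modulo 2**32.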
        if crc32 != U32(self.crc):
            # (completion added; the exact message is not preserved in this
            # fragment, a mismatch is an I/O error per the comments above)
            raise IOError("CRC check failed %d %d" % (crc32, U32(self.crc)))

    def readlines(self, sizehint=0):
        # 4168 calls in 417.
        # Negative numbers result in reading all the lines

        # python's gzip routine uses sizehint. This is a more efficient way
        # than python uses to honor it. But it is even more efficient to
        # just read the entire thing and use cStringIO to split into lines.
        # content = self.read(sizehint)
        # return bzrlib.osutils.split_lines(content)
        content = StringIO(self.read(-1))
        return content.readlines()
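        # Note (added, not from the original source): e.g. if the uncompressed
        # content is 'a\nb\nc\n', this returns ['a\n', 'b\n', 'c\n'], the same
        # lines a readline() loop would yield, but via a single read().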

    def _unread(self, buf, len_buf=None):
        """tuned to remove unneeded len calls.