1
# Copyright (C) 2005, 2006 by Canonical Ltd
1
# Copyright (C) 2005, 2006 Canonical Ltd
2
2
# Written by Robert Collins <robert.collins@canonical.com>
4
4
# This program is free software; you can redistribute it and/or modify
18
18
"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""
20
from cStringIO import StringIO
20
22
# make GzipFile faster:
22
24
from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC
27
__all__ = ["GzipFile"]
29
# we want a \n preserved, break on \n only splitlines.
32
__all__ = ["GzipFile", "bytes_to_gzip"]
35
def bytes_to_gzip(bytes, factory=zlib.compressobj,
36
level=zlib.Z_DEFAULT_COMPRESSION, method=zlib.DEFLATED,
37
width=-zlib.MAX_WBITS, mem=zlib.DEF_MEM_LEVEL,
39
"""Create a gzip file containing bytes and return its content."""
41
'\037\213' # self.fileobj.write('\037\213') # magic header
42
'\010' # self.fileobj.write('\010') # compression method
43
# fname = self.filename[:-3]
47
'\x00' # self.fileobj.write(chr(flags))
48
'\0\0\0\0' # write32u(self.fileobj, long(time.time()))
49
'\002' # self.fileobj.write('\002')
50
'\377' # self.fileobj.write('\377')
52
'' # self.fileobj.write(fname + '\000')
54
# using a compressobj avoids a small header and trailer that the compress()
55
# utility function adds.
56
compress = factory(level, method, width, mem, 0)
57
result.append(compress.compress(bytes))
58
result.append(compress.flush())
59
result.append(struct.pack("<L", LOWU32(crc32(bytes))))
60
# size may exceed 2GB, or even 4GB
61
result.append(struct.pack("<L", LOWU32(len(bytes))))
62
return ''.join(result)
30
65
class GzipFile(gzip.GzipFile):
60
95
self.extrasize += len_data
61
96
self.size += len_data
98
def _write_gzip_header(self):
99
"""A tuned version of gzip._write_gzip_header
101
We have some extra constraints that plain Gzip does not.
102
1) We want to write the whole blob at once, rather than multiple
103
calls to fileobj.write().
104
2) We never have a filename
105
3) We don't care about the time
108
'\037\213' # self.fileobj.write('\037\213') # magic header
109
'\010' # self.fileobj.write('\010') # compression method
110
# fname = self.filename[:-3]
114
'\x00' # self.fileobj.write(chr(flags))
115
'\0\0\0\0' # write32u(self.fileobj, long(time.time()))
116
'\002' # self.fileobj.write('\002')
117
'\377' # self.fileobj.write('\377')
119
'' # self.fileobj.write(fname + '\000')
63
122
def _read(self, size=1024):
64
123
# various optimisations:
65
124
# reduces lsprof count from 2500 to
92
151
self._add_read_data(self.decompress.flush())
93
assert len(self.decompress.unused_data) >= 8, "what does flush do?"
152
if len(self.decompress.unused_data) < 8:
153
raise AssertionError("what does flush do?")
154
self._gzip_tail = self.decompress.unused_data[0:8]
95
156
# tell the driving read() call we have stuffed all the data
96
157
# in self.extrabuf
112
173
if seek_length > 0:
113
174
# we read too much data
114
175
self.fileobj.seek(-seek_length, 1)
176
self._gzip_tail = self.decompress.unused_data[0:8]
115
177
elif seek_length < 0:
116
178
# we haven't read enough to check the checksum.
117
assert -8 < seek_length, "too great a seek."
179
if not (-8 < seek_length):
180
raise AssertionError("too great a seek")
118
181
buf = self.fileobj.read(-seek_length)
119
self.decompress.decompress(buf)
182
self._gzip_tail = self.decompress.unused_data + buf
184
self._gzip_tail = self.decompress.unused_data
121
186
# Check the CRC and file size, and set the flag so we read
122
187
# a new member on the next call
134
199
# We've read to the end of the file, so we should have 8 bytes of
135
# unused data in the decompressor. If we dont, there is a corrupt file.
200
# unused data in the decompressor. If we don't, there is a corrupt file.
136
201
# We use these 8 bytes to calculate the CRC and the recorded file size.
137
202
# We then check that the computed CRC and size of the
138
203
# uncompressed data matches the stored values. Note that the size
139
204
# stored is the true file size mod 2**32.
140
crc32, isize = struct.unpack("<LL", self.decompress.unused_data[0:8])
205
if not (len(self._gzip_tail) == 8):
206
raise AssertionError("gzip trailer is incorrect length.")
207
crc32, isize = struct.unpack("<LL", self._gzip_tail)
141
208
# note that isize is unsigned - it can exceed 2GB
142
209
if crc32 != U32(self.crc):
143
210
raise IOError, "CRC check failed %d %d" % (crc32, U32(self.crc))
248
315
# 4168 calls in 417.
249
316
# Negative numbers result in reading all the lines
252
content = self.read(sizehint)
253
return content.splitlines(True)
318
# python's gzip routine uses sizehint. This is a more efficient way
319
# than python uses to honor it. But it is even more efficient to
320
# just read the entire thing and use cStringIO to split into lines.
323
# content = self.read(sizehint)
324
# return bzrlib.osutils.split_lines(content)
325
content = StringIO(self.read(-1))
326
return content.readlines()
255
328
def _unread(self, buf, len_buf=None):
256
329
"""tuned to remove unneeded len calls.