1
# Copyright (C) 2005, 2006 by Canonical Ltd
1
# Copyright (C) 2005, 2006 Canonical Ltd
2
2
# Written by Robert Collins <robert.collins@canonical.com>
4
4
# This program is free software; you can redistribute it and/or modify
18
18
"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""
20
from cStringIO import StringIO
20
22
# make GzipFile faster:
22
24
from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC
27
29
# we want the \n preserved, so use a splitlines that breaks on \n only.
30
__all__ = ["GzipFile"]
32
__all__ = ["GzipFile", "bytes_to_gzip"]
35
def bytes_to_gzip(bytes, factory=zlib.compressobj,
36
level=zlib.Z_DEFAULT_COMPRESSION, method=zlib.DEFLATED,
37
width=-zlib.MAX_WBITS, mem=zlib.DEF_MEM_LEVEL,
39
"""Create a gzip file containing bytes and return its content."""
41
'\037\213' # self.fileobj.write('\037\213') # magic header
42
'\010' # self.fileobj.write('\010') # compression method
43
# fname = self.filename[:-3]
47
'\x00' # self.fileobj.write(chr(flags))
48
'\0\0\0\0' # write32u(self.fileobj, long(time.time()))
49
'\002' # self.fileobj.write('\002')
50
'\377' # self.fileobj.write('\377')
52
'' # self.fileobj.write(fname + '\000')
54
# using a compressobj avoids a small header and trailer that the compress()
55
# utility function adds.
56
compress = factory(level, method, width, mem, 0)
57
result.append(compress.compress(bytes))
58
result.append(compress.flush())
59
result.append(struct.pack("<L", LOWU32(crc32(bytes))))
60
# size may exceed 2GB, or even 4GB
61
result.append(struct.pack("<L", LOWU32(len(bytes))))
62
return ''.join(result)
33
65
class GzipFile(gzip.GzipFile):
63
95
self.extrasize += len_data
64
96
self.size += len_data
98
def _write_gzip_header(self):
99
"""A tuned version of gzip._write_gzip_header
101
We have some extra constraints that plain Gzip does not have.
102
1) We want to write the whole blob at once, rather than multiple
103
calls to fileobj.write().
104
2) We never have a filename
105
3) We don't care about the time
108
'\037\213' # self.fileobj.write('\037\213') # magic header
109
'\010' # self.fileobj.write('\010') # compression method
110
# fname = self.filename[:-3]
114
'\x00' # self.fileobj.write(chr(flags))
115
'\0\0\0\0' # write32u(self.fileobj, long(time.time()))
116
'\002' # self.fileobj.write('\002')
117
'\377' # self.fileobj.write('\377')
119
'' # self.fileobj.write(fname + '\000')
66
122
def _read(self, size=1024):
67
123
# various optimisations:
68
124
# reduces lsprof count from 2500 to
95
151
self._add_read_data(self.decompress.flush())
96
assert len(self.decompress.unused_data) >= 8, "what does flush do?"
152
if len(self.decompress.unused_data) < 8:
153
raise AssertionError("what does flush do?")
97
154
self._gzip_tail = self.decompress.unused_data[0:8]
99
156
# tell the driving read() call we have stuffed all the data
119
176
self._gzip_tail = self.decompress.unused_data[0:8]
120
177
elif seek_length < 0:
121
178
# we haven't read enough to check the checksum.
122
assert -8 < seek_length, "too great a seek."
179
if not (-8 < seek_length):
180
raise AssertionError("too great a seek")
123
181
buf = self.fileobj.read(-seek_length)
124
182
self._gzip_tail = self.decompress.unused_data + buf
141
199
# We've read to the end of the file, so we should have 8 bytes of
142
# unused data in the decompressor. If we dont, there is a corrupt file.
200
# unused data in the decompressor. If we don't, there is a corrupt file.
143
201
# We use these 8 bytes to calculate the CRC and the recorded file size.
144
202
# We then check that the computed CRC and size of the
145
203
# uncompressed data matches the stored values. Note that the size
146
204
# stored is the true file size mod 2**32.
147
assert len(self._gzip_tail) == 8, "gzip trailer is incorrect length."
205
if not (len(self._gzip_tail) == 8):
206
raise AssertionError("gzip trailer is incorrect length.")
148
207
crc32, isize = struct.unpack("<LL", self._gzip_tail)
149
208
# note that isize is unsigned - it can exceed 2GB
150
209
if crc32 != U32(self.crc):
256
315
# 4168 calls in 417.
257
316
# Negative numbers result in reading all the lines
260
content = self.read(sizehint)
261
return bzrlib.osutils.split_lines(content)
318
# python's gzip routine uses sizehint. This is a more efficient way
319
# than python uses to honor it. But it is even more efficient to
320
# just read the entire thing and use cStringIO to split into lines.
323
# content = self.read(sizehint)
324
# return bzrlib.osutils.split_lines(content)
325
content = StringIO(self.read(-1))
326
return content.readlines()
263
328
def _unread(self, buf, len_buf=None):
264
329
"""tuned to remove unneeded len calls.