~bzr-pqm/bzr/bzr.dev : contents of bzrlib/tuned

~bzr-pqm/bzr/bzr.dev : (revision 3316.2.13)

# Copyright (C) 2005, 2006 Canonical Ltd
# Written by Robert Collins <robert.collins@canonical.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""

from cStringIO import StringIO

# make GzipFile faster:
import gzip
from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC
import sys
import struct
import zlib

# we want a \n preserved, break on \n only splitlines.
import bzrlib

__all__ = ["GzipFile", "bytes_to_gzip"]


def bytes_to_gzip(bytes, factory=zlib.compressobj,
    level=zlib.Z_DEFAULT_COMPRESSION, method=zlib.DEFLATED,
    width=-zlib.MAX_WBITS, mem=zlib.DEF_MEM_LEVEL,
    crc32=zlib.crc32):
    """Create a gzip file containing bytes and return its content."""
    result = [
        '\037\213'  # self.fileobj.write('\037\213')  # magic header
        '\010'      # self.fileobj.write('\010')      # compression method
                    # fname = self.filename[:-3]
                    # flags = 0
                    # if fname:
                    #     flags = FNAME
        '\x00'      # self.fileobj.write(chr(flags))
        '\0\0\0\0'  # write32u(self.fileobj, long(time.time()))
        '\002'      # self.fileobj.write('\002')
        '\377'      # self.fileobj.write('\377')
                    # if fname:
        ''          #     self.fileobj.write(fname + '\000')
        ]
    # using a compressobj avoids a small header and trailer that the compress()
    # utility function adds.
    compress = factory(level, method, width, mem, 0)
    result.append(compress.compress(bytes))
    result.append(compress.flush())
    result.append(struct.pack("<L", LOWU32(crc32(bytes))))
    # size may exceed 2GB, or even 4GB
    result.append(struct.pack("<L", LOWU32(len(bytes))))
    return ''.join(result)


class GzipFile(gzip.GzipFile):
    """Knit tuned version of GzipFile.

    This is based on the following lsprof stats:
    python 2.4 stock GzipFile write:
    58971      0   5644.3090   2721.4730   gzip:193(write)
    +58971     0   1159.5530   1159.5530   +<built-in method compress>
    +176913    0    987.0320    987.0320   +<len>
    +58971     0    423.1450    423.1450   +<zlib.crc32>
    +58971     0    353.1060    353.1060   +<method 'write' of 'cStringIO.
                                            StringO' objects>
    tuned GzipFile write:
    58971      0   4477.2590   2103.1120   bzrlib.knit:1250(write)
    +58971     0   1297.7620   1297.7620   +<built-in method compress>
    +58971     0    406.2160    406.2160   +<zlib.crc32>
    +58971     0    341.9020    341.9020   +<method 'write' of 'cStringIO.
                                            StringO' objects>
    +58971     0    328.2670    328.2670   +<len>


    Yes, its only 1.6 seconds, but they add up.
    """

    def _add_read_data(self, data):
        # 4169 calls in 183
        # temp var for len(data) and switch to +='s.
        # 4169 in 139
        len_data = len(data)
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf += data
        self.extrasize += len_data
        self.size += len_data

    def _write_gzip_header(self):
        """A tuned version of gzip._write_gzip_header

        We have some extra constrains that plain Gzip does not.
        1) We want to write the whole blob at once. rather than multiple 
           calls to fileobj.write().
        2) We never have a filename
        3) We don't care about the time
        """
        self.fileobj.write(
           '\037\213'   # self.fileobj.write('\037\213')  # magic header
            '\010'      # self.fileobj.write('\010')      # compression method
                        # fname = self.filename[:-3]
                        # flags = 0
                        # if fname:
                        #     flags = FNAME
            '\x00'      # self.fileobj.write(chr(flags))
            '\0\0\0\0'  # write32u(self.fileobj, long(time.time()))
            '\002'      # self.fileobj.write('\002')
            '\377'      # self.fileobj.write('\377')
                        # if fname:
            ''          #     self.fileobj.write(fname + '\000')
            )

    def _read(self, size=1024):
        # various optimisations:
        # reduces lsprof count from 2500 to 
        # 8337 calls in 1272, 365 internal
        if self.fileobj is None:
            raise EOFError, "Reached EOF"

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            next_header_bytes = self.fileobj.read(10)
            if next_header_bytes == '':
                raise EOFError, "Reached EOF"

            self._init_read()
            self._read_gzip_header(next_header_bytes)
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            self._add_read_data(self.decompress.flush())
            assert len(self.decompress.unused_data) >= 8, "what does flush do?"
            self._gzip_tail = self.decompress.unused_data[0:8]
            self._read_eof()
            # tell the driving read() call we have stuffed all the data
            # in self.extrabuf
            raise EOFError, 'Reached EOF'

        self._add_read_data(self.decompress.decompress(buf))

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the data for the next member which
            # is the length of the decompress objects unused data - the first
            # 8 bytes for the end crc and size records.
            #
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because those 8 bytes are part of this member.
            seek_length = len (self.decompress.unused_data) - 8
            if seek_length > 0:
                # we read too much data
                self.fileobj.seek(-seek_length, 1)
                self._gzip_tail = self.decompress.unused_data[0:8]
            elif seek_length < 0:
                # we haven't read enough to check the checksum.
                assert -8 < seek_length, "too great a seek."
                buf = self.fileobj.read(-seek_length)
                self._gzip_tail = self.decompress.unused_data + buf
            else:
                self._gzip_tail = self.decompress.unused_data

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _read_eof(self):
        """tuned to reduce function calls and eliminate file seeking:
        pass 1:
        reduces lsprof count from 800 to 288
        4168 in 296 
        avoid U32 call by using struct format L
        4168 in 200
        """
        # We've read to the end of the file, so we should have 8 bytes of 
        # unused data in the decompressor. If we don't, there is a corrupt file.
        # We use these 8 bytes to calculate the CRC and the recorded file size.
        # We then check the that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        assert len(self._gzip_tail) == 8, "gzip trailer is incorrect length."
        crc32, isize = struct.unpack("<LL", self._gzip_tail)
        # note that isize is unsigned - it can exceed 2GB
        if crc32 != U32(self.crc):
            raise IOError, "CRC check failed %d %d" % (crc32, U32(self.crc))
        elif isize != LOWU32(self.size):
            raise IOError, "Incorrect length of data produced"

    def _read_gzip_header(self, bytes=None):
        """Supply bytes if the minimum header size is already read.
        
        :param bytes: 10 bytes of header data.
        """
        """starting cost: 300 in 3998
        15998 reads from 3998 calls
        final cost 168
        """
        if bytes is None:
            bytes = self.fileobj.read(10)
        magic = bytes[0:2]
        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord(bytes[2:3])
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord(bytes[3:4])
        # modtime = self.fileobj.read(4) (bytes [4:8])
        # extraflag = self.fileobj.read(1) (bytes[8:9])
        # os = self.fileobj.read(1) (bytes[9:10])
        # self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def readline(self, size=-1):
        """Tuned to remove buffer length calls in _unread and...
        
        also removes multiple len(c) calls, inlines _unread,
        total savings - lsprof 5800 to 5300
        phase 2:
        4168 calls in 2233
        8176 calls to read() in 1684
        changing the min chunk size to 200 halved all the cache misses
        leading to a drop to:
        4168 calls in 1977
        4168 call to read() in 1646
        - i.e. just reduced the function call overhead. May be worth 
          keeping.
        """
        if size < 0: size = sys.maxint
        bufs = []
        readsize = min(200, size)    # Read from the file in small chunks
        while True:
            if size == 0:
                return "".join(bufs) # Return resulting line

            # c is the chunk
            c = self.read(readsize)
            # number of bytes read
            len_c = len(c)
            i = c.find('\n')
            if size is not None:
                # We set i=size to break out of the loop under two
                # conditions: 1) there's no newline, and the chunk is
                # larger than size, or 2) there is a newline, but the
                # resulting line would be longer than 'size'.
                if i==-1 and len_c > size: i=size-1
                elif size <= i: i = size -1

            if i >= 0 or c == '':
                # if i>= 0 we have a newline or have triggered the above
                # if size is not None condition.
                # if c == '' its EOF.
                bufs.append(c[:i+1])    # Add portion of last chunk
                # -- inlined self._unread --
                ## self._unread(c[i+1:], len_c - i)   # Push back rest of chunk
                self.extrabuf = c[i+1:] + self.extrabuf
                self.extrasize = len_c - i + self.extrasize
                self.offset -= len_c - i
                # -- end inlined self._unread --
                return ''.join(bufs)    # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len_c
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        # optimise to avoid all the buffer manipulation
        # lsprof changed from:
        # 4168 calls in 5472 with 32000 calls to readline()
        # to :
        # 4168 calls in 417.
        # Negative numbers result in reading all the lines
        
        # python's gzip routine uses sizehint. This is a more efficient way
        # than python uses to honor it. But it is even more efficient to
        # just read the entire thing and use cStringIO to split into lines.
        # if sizehint <= 0:
        #     sizehint = -1
        # content = self.read(sizehint)
        # return bzrlib.osutils.split_lines(content)
        content = StringIO(self.read(-1))
        return content.readlines()

    def _unread(self, buf, len_buf=None):
        """tuned to remove unneeded len calls.
        
        because this is such an inner routine in readline, and readline is
        in many inner loops, this has been inlined into readline().

        The len_buf parameter combined with the reduction in len calls dropped
        the lsprof ms count for this routine on my test data from 800 to 200 - 
        a 75% saving.
        """
        if len_buf is None:
            len_buf = len(buf)
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len_buf + self.extrasize
        self.offset -= len_buf

    def write(self, data):
        if self.mode != gzip.WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"
        data_len = len(data)
        if data_len > 0:
            self.size = self.size + data_len
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write( self.compress.compress(data) )
            self.offset += data_len

    def writelines(self, lines):
        # profiling indicated a significant overhead 
        # calling write for each line.
        # this batch call is a lot faster :).
        # (4 seconds to 1 seconds for the sample upgrades I was testing).
        self.write(''.join(lines))



2052.3.2 by John Arbash Meinel Change Copyright .. by Canonical to Copyright ... Canonical	1	# Copyright (C) 2005, 2006 Canonical Ltd
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	2	# Written by Robert Collins <robert.collins@canonical.com>
	3	#
	4	# This program is free software; you can redistribute it and/or modify
	5	# it under the terms of the GNU General Public License as published by
	6	# the Free Software Foundation; either version 2 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU General Public License
	15	# along with this program; if not, write to the Free Software
	16	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	17
	18	"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""
	19
1908.4.12 by John Arbash Meinel Minor change to tuned_gzip.	20	from cStringIO import StringIO
1908.4.5 by John Arbash Meinel Some small tweaks to knit and tuned_gzip to shave off another couple seconds	21
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	22	# make GzipFile faster:
	23	import gzip
	24	from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC
	25	import sys
	26	import struct
	27	import zlib
	28
1666.1.6 by Robert Collins Make knit the default format.	29	# we want a \n preserved, break on \n only splitlines.
	30	import bzrlib
	31
2817.3.1 by Robert Collins * New helper ``bzrlib.tuned_gzip.bytes_to_gzip`` which takes a byte string	32	__all__ = ["GzipFile", "bytes_to_gzip"]
	33
	34
	35	def bytes_to_gzip(bytes, factory=zlib.compressobj,
	36	level=zlib.Z_DEFAULT_COMPRESSION, method=zlib.DEFLATED,
	37	width=-zlib.MAX_WBITS, mem=zlib.DEF_MEM_LEVEL,
	38	crc32=zlib.crc32):
	39	"""Create a gzip file containing bytes and return its content."""
	40	result = [
	41	'\037\213' # self.fileobj.write('\037\213') # magic header
	42	'\010' # self.fileobj.write('\010') # compression method
	43	# fname = self.filename[:-3]
	44	# flags = 0
	45	# if fname:
	46	# flags = FNAME
	47	'\x00' # self.fileobj.write(chr(flags))
	48	'\0\0\0\0' # write32u(self.fileobj, long(time.time()))
	49	'\002' # self.fileobj.write('\002')
	50	'\377' # self.fileobj.write('\377')
	51	# if fname:
	52	'' # self.fileobj.write(fname + '\000')
	53	]
	54	# using a compressobj avoids a small header and trailer that the compress()
	55	# utility function adds.
	56	compress = factory(level, method, width, mem, 0)
	57	result.append(compress.compress(bytes))
	58	result.append(compress.flush())
	59	result.append(struct.pack("<L", LOWU32(crc32(bytes))))
	60	# size may exceed 2GB, or even 4GB
	61	result.append(struct.pack("<L", LOWU32(len(bytes))))
	62	return ''.join(result)
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	63
	64
	65	class GzipFile(gzip.GzipFile):
	66	"""Knit tuned version of GzipFile.
	67
	68	This is based on the following lsprof stats:
	69	python 2.4 stock GzipFile write:
	70	58971 0 5644.3090 2721.4730 gzip:193(write)
	71	+58971 0 1159.5530 1159.5530 +<built-in method compress>
	72	+176913 0 987.0320 987.0320 +<len>
	73	+58971 0 423.1450 423.1450 +<zlib.crc32>
	74	+58971 0 353.1060 353.1060 +<method 'write' of 'cStringIO.
	75	StringO' objects>
	76	tuned GzipFile write:
	77	58971 0 4477.2590 2103.1120 bzrlib.knit:1250(write)
	78	+58971 0 1297.7620 1297.7620 +<built-in method compress>
	79	+58971 0 406.2160 406.2160 +<zlib.crc32>
	80	+58971 0 341.9020 341.9020 +<method 'write' of 'cStringIO.
	81	StringO' objects>
	82	+58971 0 328.2670 328.2670 +<len>
	83
	84
	85	Yes, its only 1.6 seconds, but they add up.
	86	"""
	87
	88	def _add_read_data(self, data):
	89	# 4169 calls in 183
	90	# temp var for len(data) and switch to +='s.
	91	# 4169 in 139
	92	len_data = len(data)
	93	self.crc = zlib.crc32(data, self.crc)
	94	self.extrabuf += data
	95	self.extrasize += len_data
	96	self.size += len_data
	97
1908.4.3 by John Arbash Meinel Shave another second off of _record_to_data time, by optimizing single write versus multiple writes	98	def _write_gzip_header(self):
	99	"""A tuned version of gzip._write_gzip_header
	100
	101	We have some extra constrains that plain Gzip does not.
1908.4.10 by John Arbash Meinel Small cleanups	102	1) We want to write the whole blob at once. rather than multiple
1908.4.10 by John Arbash Meinel Small cleanups	103	calls to fileobj.write().
1908.4.3 by John Arbash Meinel Shave another second off of _record_to_data time, by optimizing single write versus multiple writes	104	2) We never have a filename
	105	3) We don't care about the time
	106	"""
	107	self.fileobj.write(
	108	'\037\213' # self.fileobj.write('\037\213') # magic header
	109	'\010' # self.fileobj.write('\010') # compression method
	110	# fname = self.filename[:-3]
	111	# flags = 0
	112	# if fname:
	113	# flags = FNAME
	114	'\x00' # self.fileobj.write(chr(flags))
	115	'\0\0\0\0' # write32u(self.fileobj, long(time.time()))
	116	'\002' # self.fileobj.write('\002')
	117	'\377' # self.fileobj.write('\377')
	118	# if fname:
	119	'' # self.fileobj.write(fname + '\000')
	120	)
	121
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	122	def _read(self, size=1024):
	123	# various optimisations:
	124	# reduces lsprof count from 2500 to
	125	# 8337 calls in 1272, 365 internal
	126	if self.fileobj is None:
	127	raise EOFError, "Reached EOF"
	128
	129	if self._new_member:
	130	# If the _new_member flag is set, we have to
	131	# jump to the next member, if there is one.
	132	#
	133	# First, check if we're at the end of the file;
	134	# if so, it's time to stop; no more members to read.
	135	next_header_bytes = self.fileobj.read(10)
	136	if next_header_bytes == '':
	137	raise EOFError, "Reached EOF"
	138
	139	self._init_read()
	140	self._read_gzip_header(next_header_bytes)
	141	self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
	142	self._new_member = False
	143
	144	# Read a chunk of data from the file
	145	buf = self.fileobj.read(size)
	146
	147	# If the EOF has been reached, flush the decompression object
	148	# and mark this object as finished.
	149
	150	if buf == "":
	151	self._add_read_data(self.decompress.flush())
	152	assert len(self.decompress.unused_data) >= 8, "what does flush do?"
1666.1.11 by Robert Collins Really fix short-read support in tuned_gzip. The python zlib module behaved differently than thought.	153	self._gzip_tail = self.decompress.unused_data[0:8]
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	154	self._read_eof()
	155	# tell the driving read() call we have stuffed all the data
	156	# in self.extrabuf
	157	raise EOFError, 'Reached EOF'
	158
	159	self._add_read_data(self.decompress.decompress(buf))
	160
	161	if self.decompress.unused_data != "":
	162	# Ending case: we've come to the end of a member in the file,
	163	# so seek back to the start of the data for the next member which
	164	# is the length of the decompress objects unused data - the first
	165	# 8 bytes for the end crc and size records.
	166	#
	167	# so seek back to the start of the unused data, finish up
	168	# this member, and read a new gzip header.
	169	# (The number of bytes to seek back is the length of the unused
	170	# data, minus 8 because those 8 bytes are part of this member.
	171	seek_length = len (self.decompress.unused_data) - 8
1666.1.2 by Robert Collins Fix race condition between end of stream and end of file with tuned_gzip.	172	if seek_length > 0:
	173	# we read too much data
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	174	self.fileobj.seek(-seek_length, 1)
1666.1.11 by Robert Collins Really fix short-read support in tuned_gzip. The python zlib module behaved differently than thought.	175	self._gzip_tail = self.decompress.unused_data[0:8]
1666.1.2 by Robert Collins Fix race condition between end of stream and end of file with tuned_gzip.	176	elif seek_length < 0:
	177	# we haven't read enough to check the checksum.
	178	assert -8 < seek_length, "too great a seek."
	179	buf = self.fileobj.read(-seek_length)
1666.1.11 by Robert Collins Really fix short-read support in tuned_gzip. The python zlib module behaved differently than thought.	180	self._gzip_tail = self.decompress.unused_data + buf
	181	else:
	182	self._gzip_tail = self.decompress.unused_data
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	183
	184	# Check the CRC and file size, and set the flag so we read
	185	# a new member on the next call
	186	self._read_eof()
	187	self._new_member = True
	188
	189	def _read_eof(self):
	190	"""tuned to reduce function calls and eliminate file seeking:
	191	pass 1:
	192	reduces lsprof count from 800 to 288
	193	4168 in 296
	194	avoid U32 call by using struct format L
	195	4168 in 200
	196	"""
	197	# We've read to the end of the file, so we should have 8 bytes of
1759.2.2 by Jelmer Vernooij Revert some of my spelling fixes and fix some typos after review by Aaron.	198	# unused data in the decompressor. If we don't, there is a corrupt file.
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	199	# We use these 8 bytes to calculate the CRC and the recorded file size.
	200	# We then check the that the computed CRC and size of the
	201	# uncompressed data matches the stored values. Note that the size
	202	# stored is the true file size mod 2**32.
1666.1.11 by Robert Collins Really fix short-read support in tuned_gzip. The python zlib module behaved differently than thought.	203	assert len(self._gzip_tail) == 8, "gzip trailer is incorrect length."
	204	crc32, isize = struct.unpack("<LL", self._gzip_tail)
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	205	# note that isize is unsigned - it can exceed 2GB
	206	if crc32 != U32(self.crc):
1666.1.2 by Robert Collins Fix race condition between end of stream and end of file with tuned_gzip.	207	raise IOError, "CRC check failed %d %d" % (crc32, U32(self.crc))
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	208	elif isize != LOWU32(self.size):
	209	raise IOError, "Incorrect length of data produced"
	210
	211	def _read_gzip_header(self, bytes=None):
	212	"""Supply bytes if the minimum header size is already read.
	213
	214	:param bytes: 10 bytes of header data.
	215	"""
	216	"""starting cost: 300 in 3998
	217	15998 reads from 3998 calls
	218	final cost 168
	219	"""
	220	if bytes is None:
	221	bytes = self.fileobj.read(10)
	222	magic = bytes[0:2]
	223	if magic != '\037\213':
	224	raise IOError, 'Not a gzipped file'
	225	method = ord(bytes[2:3])
	226	if method != 8:
	227	raise IOError, 'Unknown compression method'
	228	flag = ord(bytes[3:4])
	229	# modtime = self.fileobj.read(4) (bytes [4:8])
	230	# extraflag = self.fileobj.read(1) (bytes[8:9])
	231	# os = self.fileobj.read(1) (bytes[9:10])
	232	# self.fileobj.read(6)
	233
	234	if flag & FEXTRA:
	235	# Read & discard the extra field, if present
	236	xlen = ord(self.fileobj.read(1))
	237	xlen = xlen + 256*ord(self.fileobj.read(1))
	238	self.fileobj.read(xlen)
	239	if flag & FNAME:
	240	# Read and discard a null-terminated string containing the filename
	241	while True:
	242	s = self.fileobj.read(1)
	243	if not s or s=='\000':
	244	break
	245	if flag & FCOMMENT:
	246	# Read and discard a null-terminated string containing a comment
	247	while True:
	248	s = self.fileobj.read(1)
	249	if not s or s=='\000':
	250	break
	251	if flag & FHCRC:
	252	self.fileobj.read(2) # Read & discard the 16-bit header CRC
	253
	254	def readline(self, size=-1):
	255	"""Tuned to remove buffer length calls in _unread and...
	256
	257	also removes multiple len(c) calls, inlines _unread,
	258	total savings - lsprof 5800 to 5300
	259	phase 2:
	260	4168 calls in 2233
	261	8176 calls to read() in 1684
	262	changing the min chunk size to 200 halved all the cache misses
	263	leading to a drop to:
	264	4168 calls in 1977
	265	4168 call to read() in 1646
	266	- i.e. just reduced the function call overhead. May be worth
	267	keeping.
	268	"""
	269	if size < 0: size = sys.maxint
	270	bufs = []
	271	readsize = min(200, size) # Read from the file in small chunks
272	while True:
273	if size == 0:
274	return "".join(bufs) # Return resulting line
275
276	# c is the chunk
277	c = self.read(readsize)
278	# number of bytes read
279	len_c = len(c)
280	i = c.find('\n')
281	if size is not None:
282	# We set i=size to break out of the loop under two
283	# conditions: 1) there's no newline, and the chunk is
284	# larger than size, or 2) there is a newline, but the
285	# resulting line would be longer than 'size'.
286	if i==-1 and len_c > size: i=size-1
287	elif size <= i: i = size -1
288
289	if i >= 0 or c == '':
290	# if i>= 0 we have a newline or have triggered the above
291	# if size is not None condition.
292	# if c == '' its EOF.
293	bufs.append(c[:i+1]) # Add portion of last chunk
294	# -- inlined self._unread --
295	## self._unread(c[i+1:], len_c - i) # Push back rest of chunk
296	self.extrabuf = c[i+1:] + self.extrabuf
297	self.extrasize = len_c - i + self.extrasize
298	self.offset -= len_c - i
299	# -- end inlined self._unread --
300	return ''.join(bufs) # Return resulting line
301
302	# Append chunk to list, decrease 'size',
303	bufs.append(c)
304	size = size - len_c
305	readsize = min(size, readsize * 2)
306
307	def readlines(self, sizehint=0):
308	# optimise to avoid all the buffer manipulation
309	# lsprof changed from:
310	# 4168 calls in 5472 with 32000 calls to readline()
311	# to :
312	# 4168 calls in 417.
313	# Negative numbers result in reading all the lines
1908.4.15 by John Arbash Meinel comment on tuned_gzip.readlines() functionality.	314
	315	# python's gzip routine uses sizehint. This is a more efficient way
	316	# than python uses to honor it. But it is even more efficient to
	317	# just read the entire thing and use cStringIO to split into lines.
	318	# if sizehint <= 0:
	319	# sizehint = -1
	320	# content = self.read(sizehint)
	321	# return bzrlib.osutils.split_lines(content)
1908.4.12 by John Arbash Meinel Minor change to tuned_gzip.	322	content = StringIO(self.read(-1))
1908.4.5 by John Arbash Meinel Some small tweaks to knit and tuned_gzip to shave off another couple seconds	323	return content.readlines()
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	324
	325	def _unread(self, buf, len_buf=None):
	326	"""tuned to remove unneeded len calls.
	327
	328	because this is such an inner routine in readline, and readline is
	329	in many inner loops, this has been inlined into readline().
	330
	331	The len_buf parameter combined with the reduction in len calls dropped
	332	the lsprof ms count for this routine on my test data from 800 to 200 -
	333	a 75% saving.
	334	"""
	335	if len_buf is None:
	336	len_buf = len(buf)
	337	self.extrabuf = buf + self.extrabuf
	338	self.extrasize = len_buf + self.extrasize
	339	self.offset -= len_buf
	340
	341	def write(self, data):
	342	if self.mode != gzip.WRITE:
	343	import errno
	344	raise IOError(errno.EBADF, "write() on read-only GzipFile object")
	345
	346	if self.fileobj is None:
	347	raise ValueError, "write() on closed GzipFile object"
	348	data_len = len(data)
	349	if data_len > 0:
	350	self.size = self.size + data_len
	351	self.crc = zlib.crc32(data, self.crc)
	352	self.fileobj.write( self.compress.compress(data) )
	353	self.offset += data_len
	354
	355	def writelines(self, lines):
	356	# profiling indicated a significant overhead
	357	# calling write for each line.
	358	# this batch call is a lot faster :).
	359	# (4 seconds to 1 seconds for the sample upgrades I was testing).
	360	self.write(''.join(lines))
	361
	362