~bzr-pqm/bzr/bzr.dev : contents of bzrlib/tuned

~bzr-pqm/bzr/bzr.dev : (revision 3688.3.3)

# Copyright (C) 2005, 2006 Canonical Ltd
# Written by Robert Collins <robert.collins@canonical.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""

from cStringIO import StringIO

# make GzipFile faster:
import gzip
from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC
import sys
import struct
import zlib

# We want a line splitter that breaks on \n only and preserves the \n
# (presumably bzrlib.osutils.split_lines; see GzipFile.readlines).
import bzrlib

__all__ = ["GzipFile", "bytes_to_gzip"]


def bytes_to_gzip(bytes, factory=zlib.compressobj,
    level=zlib.Z_DEFAULT_COMPRESSION, method=zlib.DEFLATED,
    width=-zlib.MAX_WBITS, mem=zlib.DEF_MEM_LEVEL,
    crc32=zlib.crc32):
    """Create a gzip file containing bytes and return its content."""
    result = [
        '\037\213'  # self.fileobj.write('\037\213')  # magic header
        '\010'      # self.fileobj.write('\010')      # compression method
                    # fname = self.filename[:-3]
                    # flags = 0
                    # if fname:
                    #     flags = FNAME
        '\x00'      # self.fileobj.write(chr(flags))
        '\0\0\0\0'  # write32u(self.fileobj, long(time.time()))
        '\002'      # self.fileobj.write('\002')
        '\377'      # self.fileobj.write('\377')
                    # if fname:
        ''          #     self.fileobj.write(fname + '\000')
        ]
    # using a compressobj avoids a small header and trailer that the compress()
    # utility function adds.
    compress = factory(level, method, width, mem, 0)
    result.append(compress.compress(bytes))
    result.append(compress.flush())
    result.append(struct.pack("<L", LOWU32(crc32(bytes))))
    # size may exceed 2GB, or even 4GB
    result.append(struct.pack("<L", LOWU32(len(bytes))))
    return ''.join(result)


class GzipFile(gzip.GzipFile):
    """Knit tuned version of GzipFile.

    This is based on the following lsprof stats:
    python 2.4 stock GzipFile write:
    58971      0   5644.3090   2721.4730   gzip:193(write)
    +58971     0   1159.5530   1159.5530   +<built-in method compress>
    +176913    0    987.0320    987.0320   +<len>
    +58971     0    423.1450    423.1450   +<zlib.crc32>
    +58971     0    353.1060    353.1060   +<method 'write' of 'cStringIO.
                                            StringO' objects>
    tuned GzipFile write:
    58971      0   4477.2590   2103.1120   bzrlib.knit:1250(write)
    +58971     0   1297.7620   1297.7620   +<built-in method compress>
    +58971     0    406.2160    406.2160   +<zlib.crc32>
    +58971     0    341.9020    341.9020   +<method 'write' of 'cStringIO.
                                            StringO' objects>
    +58971     0    328.2670    328.2670   +<len>


    Yes, its only 1.6 seconds, but they add up.
    """

    def _add_read_data(self, data):
        # 4169 calls in 183
        # temp var for len(data) and switch to +='s.
        # 4169 in 139
        len_data = len(data)
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf += data
        self.extrasize += len_data
        self.size += len_data

    def _write_gzip_header(self):
        """A tuned version of gzip._write_gzip_header

        We have some extra constrains that plain Gzip does not.
        1) We want to write the whole blob at once. rather than multiple 
           calls to fileobj.write().
        2) We never have a filename
        3) We don't care about the time
        """
        self.fileobj.write(
           '\037\213'   # self.fileobj.write('\037\213')  # magic header
            '\010'      # self.fileobj.write('\010')      # compression method
                        # fname = self.filename[:-3]
                        # flags = 0
                        # if fname:
                        #     flags = FNAME
            '\x00'      # self.fileobj.write(chr(flags))
            '\0\0\0\0'  # write32u(self.fileobj, long(time.time()))
            '\002'      # self.fileobj.write('\002')
            '\377'      # self.fileobj.write('\377')
                        # if fname:
            ''          #     self.fileobj.write(fname + '\000')
            )

    def _read(self, size=1024):
        # various optimisations:
        # reduces lsprof count from 2500 to 
        # 8337 calls in 1272, 365 internal
        if self.fileobj is None:
            raise EOFError, "Reached EOF"

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            next_header_bytes = self.fileobj.read(10)
            if next_header_bytes == '':
                raise EOFError, "Reached EOF"

            self._init_read()
            self._read_gzip_header(next_header_bytes)
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            self._add_read_data(self.decompress.flush())
            if len(self.decompress.unused_data) < 8:
                raise AssertionError("what does flush do?")
            self._gzip_tail = self.decompress.unused_data[0:8]
            self._read_eof()
            # tell the driving read() call we have stuffed all the data
            # in self.extrabuf
            raise EOFError, 'Reached EOF'

        self._add_read_data(self.decompress.decompress(buf))

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the data for the next member which
            # is the length of the decompress objects unused data - the first
            # 8 bytes for the end crc and size records.
            #
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because those 8 bytes are part of this member.
            seek_length = len (self.decompress.unused_data) - 8
            if seek_length > 0:
                # we read too much data
                self.fileobj.seek(-seek_length, 1)
                self._gzip_tail = self.decompress.unused_data[0:8]
            elif seek_length < 0:
                # we haven't read enough to check the checksum.
                if not (-8 < seek_length):
                    raise AssertionError("too great a seek")
                buf = self.fileobj.read(-seek_length)
                self._gzip_tail = self.decompress.unused_data + buf
            else:
                self._gzip_tail = self.decompress.unused_data

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _read_eof(self):
        """tuned to reduce function calls and eliminate file seeking:
        pass 1:
        reduces lsprof count from 800 to 288
        4168 in 296 
        avoid U32 call by using struct format L
        4168 in 200
        """
        # We've read to the end of the file, so we should have 8 bytes of 
        # unused data in the decompressor. If we don't, there is a corrupt file.
        # We use these 8 bytes to calculate the CRC and the recorded file size.
        # We then check the that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        if not (len(self._gzip_tail) == 8):
            raise AssertionError("gzip trailer is incorrect length.")
        crc32, isize = struct.unpack("<LL", self._gzip_tail)
        # note that isize is unsigned - it can exceed 2GB
        if crc32 != U32(self.crc):
            raise IOError, "CRC check failed %d %d" % (crc32, U32(self.crc))
        elif isize != LOWU32(self.size):
            raise IOError, "Incorrect length of data produced"

    def _read_gzip_header(self, bytes=None):
        """Supply bytes if the minimum header size is already read.
        
        :param bytes: 10 bytes of header data.
        """
        """starting cost: 300 in 3998
        15998 reads from 3998 calls
        final cost 168
        """
        if bytes is None:
            bytes = self.fileobj.read(10)
        magic = bytes[0:2]
        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord(bytes[2:3])
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord(bytes[3:4])
        # modtime = self.fileobj.read(4) (bytes [4:8])
        # extraflag = self.fileobj.read(1) (bytes[8:9])
        # os = self.fileobj.read(1) (bytes[9:10])
        # self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def readline(self, size=-1):
        """Tuned to remove buffer length calls in _unread and...
        
        also removes multiple len(c) calls, inlines _unread,
        total savings - lsprof 5800 to 5300
        phase 2:
        4168 calls in 2233
        8176 calls to read() in 1684
        changing the min chunk size to 200 halved all the cache misses
        leading to a drop to:
        4168 calls in 1977
        4168 call to read() in 1646
        - i.e. just reduced the function call overhead. May be worth 
          keeping.
        """
        if size < 0: size = sys.maxint
        bufs = []
        readsize = min(200, size)    # Read from the file in small chunks
        while True:
            if size == 0:
                return "".join(bufs) # Return resulting line

            # c is the chunk
            c = self.read(readsize)
            # number of bytes read
            len_c = len(c)
            i = c.find('\n')
            if size is not None:
                # We set i=size to break out of the loop under two
                # conditions: 1) there's no newline, and the chunk is
                # larger than size, or 2) there is a newline, but the
                # resulting line would be longer than 'size'.
                if i==-1 and len_c > size: i=size-1
                elif size <= i: i = size -1

            if i >= 0 or c == '':
                # if i>= 0 we have a newline or have triggered the above
                # if size is not None condition.
                # if c == '' its EOF.
                bufs.append(c[:i+1])    # Add portion of last chunk
                # -- inlined self._unread --
                ## self._unread(c[i+1:], len_c - i)   # Push back rest of chunk
                self.extrabuf = c[i+1:] + self.extrabuf
                self.extrasize = len_c - i + self.extrasize
                self.offset -= len_c - i
                # -- end inlined self._unread --
                return ''.join(bufs)    # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len_c
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        # optimise to avoid all the buffer manipulation
        # lsprof changed from:
        # 4168 calls in 5472 with 32000 calls to readline()
        # to :
        # 4168 calls in 417.
        # Negative numbers result in reading all the lines
        
        # python's gzip routine uses sizehint. This is a more efficient way
        # than python uses to honor it. But it is even more efficient to
        # just read the entire thing and use cStringIO to split into lines.
        # if sizehint <= 0:
        #     sizehint = -1
        # content = self.read(sizehint)
        # return bzrlib.osutils.split_lines(content)
        content = StringIO(self.read(-1))
        return content.readlines()

    def _unread(self, buf, len_buf=None):
        """tuned to remove unneeded len calls.
        
        because this is such an inner routine in readline, and readline is
        in many inner loops, this has been inlined into readline().

        The len_buf parameter combined with the reduction in len calls dropped
        the lsprof ms count for this routine on my test data from 800 to 200 - 
        a 75% saving.
        """
        if len_buf is None:
            len_buf = len(buf)
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len_buf + self.extrasize
        self.offset -= len_buf

    def write(self, data):
        if self.mode != gzip.WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"
        data_len = len(data)
        if data_len > 0:
            self.size = self.size + data_len
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write( self.compress.compress(data) )
            self.offset += data_len

    def writelines(self, lines):
        # profiling indicated a significant overhead 
        # calling write for each line.
        # this batch call is a lot faster :).
        # (4 seconds to 1 seconds for the sample upgrades I was testing).
        self.write(''.join(lines))



2052.3.2 by John Arbash Meinel Change Copyright .. by Canonical to Copyright ... Canonical	1	# Copyright (C) 2005, 2006 Canonical Ltd
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	2	# Written by Robert Collins <robert.collins@canonical.com>
	3	#
	4	# This program is free software; you can redistribute it and/or modify
	5	# it under the terms of the GNU General Public License as published by
	6	# the Free Software Foundation; either version 2 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU General Public License
	15	# along with this program; if not, write to the Free Software
	16	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	17
	18	"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""
	19
1908.4.12 by John Arbash Meinel Minor change to tuned_gzip.	20	from cStringIO import StringIO
1908.4.5 by John Arbash Meinel Some small tweaks to knit and tuned_gzip to shave off another couple seconds	21
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	22	# make GzipFile faster:
	23	import gzip
	24	from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC
	25	import sys
	26	import struct
	27	import zlib
	28
1666.1.6 by Robert Collins Make knit the default format.	29	# we want a \n preserved, break on \n only splitlines.
	30	import bzrlib
	31
2817.3.1 by Robert Collins * New helper ``bzrlib.tuned_gzip.bytes_to_gzip`` which takes a byte string	32	__all__ = ["GzipFile", "bytes_to_gzip"]
	33
	34
	35	def bytes_to_gzip(bytes, factory=zlib.compressobj,
	36	level=zlib.Z_DEFAULT_COMPRESSION, method=zlib.DEFLATED,
	37	width=-zlib.MAX_WBITS, mem=zlib.DEF_MEM_LEVEL,
	38	crc32=zlib.crc32):
	39	"""Create a gzip file containing bytes and return its content."""
	40	result = [
	41	'\037\213' # self.fileobj.write('\037\213') # magic header
	42	'\010' # self.fileobj.write('\010') # compression method
	43	# fname = self.filename[:-3]
	44	# flags = 0
	45	# if fname:
	46	# flags = FNAME
	47	'\x00' # self.fileobj.write(chr(flags))
	48	'\0\0\0\0' # write32u(self.fileobj, long(time.time()))
	49	'\002' # self.fileobj.write('\002')
	50	'\377' # self.fileobj.write('\377')
	51	# if fname:
	52	'' # self.fileobj.write(fname + '\000')
	53	]
	54	# using a compressobj avoids a small header and trailer that the compress()
	55	# utility function adds.
	56	compress = factory(level, method, width, mem, 0)
	57	result.append(compress.compress(bytes))
	58	result.append(compress.flush())
	59	result.append(struct.pack("<L", LOWU32(crc32(bytes))))
	60	# size may exceed 2GB, or even 4GB
	61	result.append(struct.pack("<L", LOWU32(len(bytes))))
	62	return ''.join(result)
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	63
	64
	65	class GzipFile(gzip.GzipFile):
	66	"""Knit tuned version of GzipFile.
	67
	68	This is based on the following lsprof stats:
	69	python 2.4 stock GzipFile write:
	70	58971 0 5644.3090 2721.4730 gzip:193(write)
	71	+58971 0 1159.5530 1159.5530 +<built-in method compress>
	72	+176913 0 987.0320 987.0320 +<len>
	73	+58971 0 423.1450 423.1450 +<zlib.crc32>
	74	+58971 0 353.1060 353.1060 +<method 'write' of 'cStringIO.
	75	StringO' objects>
	76	tuned GzipFile write:
	77	58971 0 4477.2590 2103.1120 bzrlib.knit:1250(write)
	78	+58971 0 1297.7620 1297.7620 +<built-in method compress>
	79	+58971 0 406.2160 406.2160 +<zlib.crc32>
	80	+58971 0 341.9020 341.9020 +<method 'write' of 'cStringIO.
	81	StringO' objects>
	82	+58971 0 328.2670 328.2670 +<len>
	83
	84
	85	Yes, its only 1.6 seconds, but they add up.
	86	"""
	87
	88	def _add_read_data(self, data):
	89	# 4169 calls in 183
	90	# temp var for len(data) and switch to +='s.
	91	# 4169 in 139
	92	len_data = len(data)
	93	self.crc = zlib.crc32(data, self.crc)
	94	self.extrabuf += data
	95	self.extrasize += len_data
	96	self.size += len_data
	97
1908.4.3 by John Arbash Meinel Shave another second off of _record_to_data time, by optimizing single write versus multiple writes	98	def _write_gzip_header(self):
	99	"""A tuned version of gzip._write_gzip_header
	100
	101	We have some extra constrains that plain Gzip does not.
1908.4.10 by John Arbash Meinel Small cleanups	102	1) We want to write the whole blob at once. rather than multiple
1908.4.10 by John Arbash Meinel Small cleanups	103	calls to fileobj.write().
1908.4.3 by John Arbash Meinel Shave another second off of _record_to_data time, by optimizing single write versus multiple writes	104	2) We never have a filename
	105	3) We don't care about the time
	106	"""
	107	self.fileobj.write(
	108	'\037\213' # self.fileobj.write('\037\213') # magic header
	109	'\010' # self.fileobj.write('\010') # compression method
	110	# fname = self.filename[:-3]
	111	# flags = 0
	112	# if fname:
	113	# flags = FNAME
	114	'\x00' # self.fileobj.write(chr(flags))
	115	'\0\0\0\0' # write32u(self.fileobj, long(time.time()))
	116	'\002' # self.fileobj.write('\002')
	117	'\377' # self.fileobj.write('\377')
	118	# if fname:
	119	'' # self.fileobj.write(fname + '\000')
	120	)
	121
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	122	def _read(self, size=1024):
	123	# various optimisations:
	124	# reduces lsprof count from 2500 to
	125	# 8337 calls in 1272, 365 internal
	126	if self.fileobj is None:
	127	raise EOFError, "Reached EOF"
	128
	129	if self._new_member:
	130	# If the _new_member flag is set, we have to
	131	# jump to the next member, if there is one.
	132	#
	133	# First, check if we're at the end of the file;
	134	# if so, it's time to stop; no more members to read.
	135	next_header_bytes = self.fileobj.read(10)
	136	if next_header_bytes == '':
	137	raise EOFError, "Reached EOF"
	138
	139	self._init_read()
	140	self._read_gzip_header(next_header_bytes)
	141	self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
	142	self._new_member = False
	143
	144	# Read a chunk of data from the file
	145	buf = self.fileobj.read(size)
	146
	147	# If the EOF has been reached, flush the decompression object
	148	# and mark this object as finished.
	149
	150	if buf == "":
	151	self._add_read_data(self.decompress.flush())
3376.2.4 by Martin Pool Remove every assert statement from bzrlib!	152	if len(self.decompress.unused_data) < 8:
	153	raise AssertionError("what does flush do?")
1666.1.11 by Robert Collins Really fix short-read support in tuned_gzip. The python zlib module behaved differently than thought.	154	self._gzip_tail = self.decompress.unused_data[0:8]
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	155	self._read_eof()
	156	# tell the driving read() call we have stuffed all the data
	157	# in self.extrabuf
	158	raise EOFError, 'Reached EOF'
	159
	160	self._add_read_data(self.decompress.decompress(buf))
	161
	162	if self.decompress.unused_data != "":
	163	# Ending case: we've come to the end of a member in the file,
	164	# so seek back to the start of the data for the next member which
	165	# is the length of the decompress objects unused data - the first
	166	# 8 bytes for the end crc and size records.
	167	#
	168	# so seek back to the start of the unused data, finish up
	169	# this member, and read a new gzip header.
	170	# (The number of bytes to seek back is the length of the unused
	171	# data, minus 8 because those 8 bytes are part of this member.
	172	seek_length = len (self.decompress.unused_data) - 8
1666.1.2 by Robert Collins Fix race condition between end of stream and end of file with tuned_gzip.	173	if seek_length > 0:
	174	# we read too much data
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	175	self.fileobj.seek(-seek_length, 1)
1666.1.11 by Robert Collins Really fix short-read support in tuned_gzip. The python zlib module behaved differently than thought.	176	self._gzip_tail = self.decompress.unused_data[0:8]
1666.1.2 by Robert Collins Fix race condition between end of stream and end of file with tuned_gzip.	177	elif seek_length < 0:
	178	# we haven't read enough to check the checksum.
3376.2.4 by Martin Pool Remove every assert statement from bzrlib!	179	if not (-8 < seek_length):
	180	raise AssertionError("too great a seek")
1666.1.2 by Robert Collins Fix race condition between end of stream and end of file with tuned_gzip.	181	buf = self.fileobj.read(-seek_length)
1666.1.11 by Robert Collins Really fix short-read support in tuned_gzip. The python zlib module behaved differently than thought.	182	self._gzip_tail = self.decompress.unused_data + buf
	183	else:
	184	self._gzip_tail = self.decompress.unused_data
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	185
	186	# Check the CRC and file size, and set the flag so we read
	187	# a new member on the next call
	188	self._read_eof()
	189	self._new_member = True
	190
	191	def _read_eof(self):
	192	"""tuned to reduce function calls and eliminate file seeking:
	193	pass 1:
	194	reduces lsprof count from 800 to 288
	195	4168 in 296
	196	avoid U32 call by using struct format L
	197	4168 in 200
	198	"""
	199	# We've read to the end of the file, so we should have 8 bytes of
1759.2.2 by Jelmer Vernooij Revert some of my spelling fixes and fix some typos after review by Aaron.	200	# unused data in the decompressor. If we don't, there is a corrupt file.
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	201	# We use these 8 bytes to calculate the CRC and the recorded file size.
	202	# We then check the that the computed CRC and size of the
	203	# uncompressed data matches the stored values. Note that the size
	204	# stored is the true file size mod 2**32.
3376.2.4 by Martin Pool Remove every assert statement from bzrlib!	205	if not (len(self._gzip_tail) == 8):
	206	raise AssertionError("gzip trailer is incorrect length.")
1666.1.11 by Robert Collins Really fix short-read support in tuned_gzip. The python zlib module behaved differently than thought.	207	crc32, isize = struct.unpack("<LL", self._gzip_tail)
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	208	# note that isize is unsigned - it can exceed 2GB
	209	if crc32 != U32(self.crc):
1666.1.2 by Robert Collins Fix race condition between end of stream and end of file with tuned_gzip.	210	raise IOError, "CRC check failed %d %d" % (crc32, U32(self.crc))
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	211	elif isize != LOWU32(self.size):
	212	raise IOError, "Incorrect length of data produced"
	213
	214	def _read_gzip_header(self, bytes=None):
	215	"""Supply bytes if the minimum header size is already read.
	216
	217	:param bytes: 10 bytes of header data.
	218	"""
	219	"""starting cost: 300 in 3998
	220	15998 reads from 3998 calls
	221	final cost 168
	222	"""
	223	if bytes is None:
	224	bytes = self.fileobj.read(10)
	225	magic = bytes[0:2]
	226	if magic != '\037\213':
	227	raise IOError, 'Not a gzipped file'
	228	method = ord(bytes[2:3])
	229	if method != 8:
	230	raise IOError, 'Unknown compression method'
	231	flag = ord(bytes[3:4])
	232	# modtime = self.fileobj.read(4) (bytes [4:8])
	233	# extraflag = self.fileobj.read(1) (bytes[8:9])
	234	# os = self.fileobj.read(1) (bytes[9:10])
	235	# self.fileobj.read(6)
	236
	237	if flag & FEXTRA:
	238	# Read & discard the extra field, if present
	239	xlen = ord(self.fileobj.read(1))
	240	xlen = xlen + 256*ord(self.fileobj.read(1))
	241	self.fileobj.read(xlen)
	242	if flag & FNAME:
	243	# Read and discard a null-terminated string containing the filename
	244	while True:
	245	s = self.fileobj.read(1)
	246	if not s or s=='\000':
	247	break
	248	if flag & FCOMMENT:
	249	# Read and discard a null-terminated string containing a comment
	250	while True:
	251	s = self.fileobj.read(1)
	252	if not s or s=='\000':
	253	break
	254	if flag & FHCRC:
	255	self.fileobj.read(2) # Read & discard the 16-bit header CRC
	256
	257	def readline(self, size=-1):
	258	"""Tuned to remove buffer length calls in _unread and...
	259
	260	also removes multiple len(c) calls, inlines _unread,
	261	total savings - lsprof 5800 to 5300
	262	phase 2:
	263	4168 calls in 2233
	264	8176 calls to read() in 1684
	265	changing the min chunk size to 200 halved all the cache misses
	266	leading to a drop to:
	267	4168 calls in 1977
	268	4168 call to read() in 1646
	269	- i.e. just reduced the function call overhead. May be worth
	270	keeping.
	271	"""
	272	if size < 0: size = sys.maxint
	273	bufs = []
	274	readsize = min(200, size) # Read from the file in small chunks
275	while True:
276	if size == 0:
277	return "".join(bufs) # Return resulting line
278
279	# c is the chunk
280	c = self.read(readsize)
281	# number of bytes read
282	len_c = len(c)
283	i = c.find('\n')
284	if size is not None:
285	# We set i=size to break out of the loop under two
286	# conditions: 1) there's no newline, and the chunk is
287	# larger than size, or 2) there is a newline, but the
288	# resulting line would be longer than 'size'.
289	if i==-1 and len_c > size: i=size-1
290	elif size <= i: i = size -1
291
292	if i >= 0 or c == '':
293	# if i>= 0 we have a newline or have triggered the above
294	# if size is not None condition.
295	# if c == '' its EOF.
296	bufs.append(c[:i+1]) # Add portion of last chunk
297	# -- inlined self._unread --
298	## self._unread(c[i+1:], len_c - i) # Push back rest of chunk
299	self.extrabuf = c[i+1:] + self.extrabuf
300	self.extrasize = len_c - i + self.extrasize
301	self.offset -= len_c - i
302	# -- end inlined self._unread --
303	return ''.join(bufs) # Return resulting line
304
305	# Append chunk to list, decrease 'size',
306	bufs.append(c)
307	size = size - len_c
308	readsize = min(size, readsize * 2)
309
310	def readlines(self, sizehint=0):
311	# optimise to avoid all the buffer manipulation
312	# lsprof changed from:
313	# 4168 calls in 5472 with 32000 calls to readline()
314	# to :
315	# 4168 calls in 417.
316	# Negative numbers result in reading all the lines
1908.4.15 by John Arbash Meinel comment on tuned_gzip.readlines() functionality.	317
	318	# python's gzip routine uses sizehint. This is a more efficient way
	319	# than python uses to honor it. But it is even more efficient to
	320	# just read the entire thing and use cStringIO to split into lines.
	321	# if sizehint <= 0:
	322	# sizehint = -1
	323	# content = self.read(sizehint)
	324	# return bzrlib.osutils.split_lines(content)
1908.4.12 by John Arbash Meinel Minor change to tuned_gzip.	325	content = StringIO(self.read(-1))
1908.4.5 by John Arbash Meinel Some small tweaks to knit and tuned_gzip to shave off another couple seconds	326	return content.readlines()
1641.1.1 by Robert Collins * Various microoptimisations to knit and gzip - reducing function call	327
	328	def _unread(self, buf, len_buf=None):
	329	"""tuned to remove unneeded len calls.
	330
	331	because this is such an inner routine in readline, and readline is
	332	in many inner loops, this has been inlined into readline().
	333
	334	The len_buf parameter combined with the reduction in len calls dropped
	335	the lsprof ms count for this routine on my test data from 800 to 200 -
	336	a 75% saving.
	337	"""
	338	if len_buf is None:
	339	len_buf = len(buf)
	340	self.extrabuf = buf + self.extrabuf
	341	self.extrasize = len_buf + self.extrasize
	342	self.offset -= len_buf
	343
	344	def write(self, data):
	345	if self.mode != gzip.WRITE:
	346	import errno
	347	raise IOError(errno.EBADF, "write() on read-only GzipFile object")
	348
	349	if self.fileobj is None:
	350	raise ValueError, "write() on closed GzipFile object"
	351	data_len = len(data)
	352	if data_len > 0:
	353	self.size = self.size + data_len
	354	self.crc = zlib.crc32(data, self.crc)
	355	self.fileobj.write( self.compress.compress(data) )
	356	self.offset += data_len
	357
	358	def writelines(self, lines):
	359	# profiling indicated a significant overhead
	360	# calling write for each line.
	361	# this batch call is a lot faster :).
	362	# (4 seconds to 1 seconds for the sample upgrades I was testing).
	363	self.write(''.join(lines))
	364
	365