~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/tuned_gzip.py

Committer: Robert Collins
Date: 2006-02-11 11:58:06 UTC
mto: (1534.1.22 integration)
mto: This revision was merged to the branch mainline in revision 1554.
Revision ID: robertc@robertcollins.net-20060211115806-732dabc1e35714ed

Give format3 working trees their own last-revision marker.

files added:
bzrlib/_merge_core.py

bzrlib/tests/test_reweave.py

bzrlib/util/configobj/validate.py

bzrlib/util/urlgrabber

bzrlib/util/urlgrabber/__init__.py

bzrlib/util/urlgrabber/byterange.py

bzrlib/util/urlgrabber/grabber.py

bzrlib/util/urlgrabber/keepalive.py

bzrlib/util/urlgrabber/mirror.py

bzrlib/util/urlgrabber/progress.py

doc/Makefile

doc/adoption.txt

doc/bitkeeper.txt

doc/changelogs.txt

doc/cherry-picking.txt

doc/cmdref.txt

doc/common-format.txt

doc/compared-aegis.txt

doc/compared-codeville.txt

doc/compared-cvsnt.txt

doc/compared-opencm.txt

doc/compared-prcs.txt

doc/compared-teamware.txt

doc/compression.txt

doc/config-specs.txt

doc/conflicts.txt

doc/costs.txt

doc/darcs.txt

doc/deadly-sins.txt

doc/default.css

doc/design.txt

doc/extra-commands.txt

doc/formats.txt

doc/hashes.txt

doc/ignore.txt

doc/index.txt

doc/interrupted.txt

doc/intro.txt

doc/inventory.txt

doc/join-branches.txt

doc/kill-version.txt

doc/layers.txt

doc/library-interface.txt

doc/merge.txt

doc/mirroring.txt

doc/monotone.txt

doc/news.txt

doc/optional-edit.txt

doc/partial-commit.txt

doc/pool.txt

doc/purpose.txt

doc/python.txt

doc/quilt.txt

doc/quotes.txt

doc/random.txt

doc/requirements.txt

doc/revfile-annotation.txt

doc/revfile.txt

doc/revision-syntax.txt

doc/rollup.txt

doc/scalability.txt

doc/security.txt

doc/shared-branches.txt

doc/short-demo.txt

doc/split-join-files.txt

doc/supportability.txt

doc/svk.txt

doc/switch-in-branch.txt

doc/tagging.txt

doc/taxonomy.txt

doc/thanks.txt

doc/todo-from-arch.txt

doc/unchanged.txt

doc/unrelated-merge.txt

doc/usability.txt

doc/use-cases.txt

doc/web-interface.txt

doc/workflow.txt

doc/yaml.txt

notes

notes/inventory-v2-sample.xml

notes/inventory-v2.rnc

notes/new-inventory-sample.xml

notes/performance.txt

notes/revfile.txt

notes/schemas.xml

patches

files removed:
COPYING.txt

bzr.ico

bzrlib/benchmarks

bzrlib/benchmarks/__init__.py

bzrlib/benchmarks/bench_add.py

bzrlib/benchmarks/bench_bench.py

bzrlib/benchmarks/bench_cache_utf8.py

bzrlib/benchmarks/bench_checkout.py

bzrlib/benchmarks/bench_commit.py

bzrlib/benchmarks/bench_inventory.py

bzrlib/benchmarks/bench_log.py

bzrlib/benchmarks/bench_osutils.py

bzrlib/benchmarks/bench_rocks.py

bzrlib/benchmarks/bench_sftp.py

bzrlib/benchmarks/bench_status.py

bzrlib/benchmarks/bench_transform.py

bzrlib/benchmarks/bench_workingtree.py

bzrlib/benchmarks/bench_xml.py

bzrlib/benchmarks/tree_creator

bzrlib/benchmarks/tree_creator/__init__.py

bzrlib/benchmarks/tree_creator/heavily_merged.py

bzrlib/benchmarks/tree_creator/kernel_like.py

bzrlib/benchmarks/tree_creator/simple_many_commit.py

bzrlib/bundle

bzrlib/bundle/apply_bundle.py

bzrlib/bundle/bundle_data.py

bzrlib/bundle/commands.py

bzrlib/bundle/common.py

bzrlib/bundle/old

bzrlib/bundle/old/send_changeset.py

bzrlib/bundle/serializer

bzrlib/bundle/serializer/__init__.py

bzrlib/bundle/serializer/v08.py

bzrlib/cache_utf8.py

bzrlib/ignores.py

bzrlib/inter.py

bzrlib/knit.py

bzrlib/lockdir.py

bzrlib/patches.py

bzrlib/patiencediff.py

bzrlib/plugins/launchpad

bzrlib/plugins/launchpad/__init__.py

bzrlib/plugins/launchpad/lp_registration.py

bzrlib/plugins/launchpad/test_register.py

bzrlib/reconcile.py

bzrlib/revisiontree.py

bzrlib/sign_my_commits.py

bzrlib/store/revision

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/store/versioned

bzrlib/tests/EncodingAdapter.py

bzrlib/tests/blackbox/test_add.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_branch.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_bundle.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_command_encoding.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_exceptions.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_ignore.py

bzrlib/tests/blackbox/test_ignored.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_ls.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_mv.py

bzrlib/tests/blackbox/test_non_ascii.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_reconcile.py

bzrlib/tests/blackbox/test_remerge.py

bzrlib/tests/blackbox/test_remove.py

bzrlib/tests/blackbox/test_revision_history.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/blackbox/test_testament.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_whoami.py

bzrlib/tests/branch_implementations/test_bound_sftp.py

bzrlib/tests/branch_implementations/test_break_lock.py

bzrlib/tests/branch_implementations/test_http.py

bzrlib/tests/branch_implementations/test_locking.py

bzrlib/tests/branch_implementations/test_pull.py

bzrlib/tests/branch_implementations/test_update.py

bzrlib/tests/interrepository_implementations

bzrlib/tests/interrepository_implementations/__init__.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/intertree_implementations

bzrlib/tests/intertree_implementations/__init__.py

bzrlib/tests/intertree_implementations/test_compare.py

bzrlib/tests/interversionedfile_implementations

bzrlib/tests/interversionedfile_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/test_join.py

bzrlib/tests/lock_helpers.py

bzrlib/tests/repository_implementations/test_break_lock.py

bzrlib/tests/repository_implementations/test_commit_builder.py

bzrlib/tests/repository_implementations/test_reconcile.py

bzrlib/tests/revisionstore_implementations

bzrlib/tests/revisionstore_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/test_atomicfile.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_cache_utf8.py

bzrlib/tests/test_escaped_store.py

bzrlib/tests/test_http_response.py

bzrlib/tests/test_ignores.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lockdir.py

bzrlib/tests/test_patch.py

bzrlib/tests/test_patches.py

bzrlib/tests/test_patches_data

bzrlib/tests/test_patches_data/diff

bzrlib/tests/test_patches_data/diff-2

bzrlib/tests/test_patches_data/diff-3

bzrlib/tests/test_patches_data/diff-4

bzrlib/tests/test_patches_data/diff-5

bzrlib/tests/test_patches_data/diff-6

bzrlib/tests/test_patches_data/insert_top.patch

bzrlib/tests/test_patches_data/mod

bzrlib/tests/test_patches_data/mod-2

bzrlib/tests/test_patches_data/mod-3

bzrlib/tests/test_patches_data/mod-4

bzrlib/tests/test_patches_data/mod-5

bzrlib/tests/test_patches_data/mod-6

bzrlib/tests/test_patches_data/orig

bzrlib/tests/test_patches_data/orig-2

bzrlib/tests/test_patches_data/orig-3

bzrlib/tests/test_patches_data/orig-4

bzrlib/tests/test_patches_data/orig-5

bzrlib/tests/test_patches_data/orig-6

bzrlib/tests/test_patches_data/patchtext.patch

bzrlib/tests/test_progress.py

bzrlib/tests/test_read_bundle.py

bzrlib/tests/test_reconcile.py

bzrlib/tests/test_revisiontree.py

bzrlib/tests/test_status.py

bzrlib/tests/test_textfile.py

bzrlib/tests/test_textmerge.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_tree.py

bzrlib/tests/test_tuned_gzip.py

bzrlib/tests/test_urlutils.py

bzrlib/tests/test_version.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/tree_implementations

bzrlib/tests/tree_implementations/__init__.py

bzrlib/tests/tree_implementations/test_test_trees.py

bzrlib/tests/workingtree_implementations/test_break_lock.py

bzrlib/tests/workingtree_implementations/test_changes_from.py

bzrlib/tests/workingtree_implementations/test_commit.py

bzrlib/tests/workingtree_implementations/test_executable.py

bzrlib/tests/workingtree_implementations/test_get_parent_ids.py

bzrlib/tests/workingtree_implementations/test_is_control_filename.py

bzrlib/tests/workingtree_implementations/test_is_ignored.py

bzrlib/tests/workingtree_implementations/test_locking.py

bzrlib/tests/workingtree_implementations/test_pull.py

bzrlib/textfile.py

bzrlib/textmerge.py

bzrlib/transform.py

bzrlib/transport/decorator.py

bzrlib/transport/fakenfs.py

bzrlib/transport/fakevfat.py

bzrlib/transport/http

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_pycurl_errors.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/http/response.py

bzrlib/tuned_gzip.py

bzrlib/urlutils.py

bzrlib/version.py

bzrlib/versionedfile.py

bzrlib/weave_commands.py

doc/README.1st

doc/configuration.txt

doc/default.css

doc/index.txt

doc/plugins.txt

doc/setting_up_email.txt

doc/specifying_revisions.txt

doc/using_aliases.txt

profile_imports.py

tools/doc_generate/autodoc_rstx.py

tools/rst2html.py

tools/win32

tools/win32/__init__.py

tools/win32/bazaar.url

tools/win32/bzr-win32-bdist-postinstall.py

tools/win32/bzr.iss.cog

tools/win32/bzr_postinstall.py

tools/win32/file_version.py

tools/win32/ostools.py

tools/win32/start_bzr.bat

files renamed:
bzrlib/bundle/__init__.py => bzrlib/_changeset.py

bzrlib/store/versioned/__init__.py => bzrlib/store/weave.py

bzrlib/tests/blackbox/test_annotate.py => bzrlib/tests/test_annotate.py

bzrlib/tests/workingtree_implementations/test_basis_inventory.py => bzrlib/tests/test_basis_inventory.py

bzrlib/tests/repository_implementations/test_revision.py => bzrlib/tests/test_revprops.py

bzrlib/tests/blackbox/test_uncommit.py => bzrlib/tests/test_uncommit.py

bzrlib/transport/http/__init__.py => bzrlib/transport/http.py

doc/tutorial.txt => tutorial.txt

files modified:
.bzrignore

BRANCH.TODO

HACKING

Makefile

NEWS

README

TODO

bzrlib/__init__.py

bzrlib/add.py

bzrlib/annotate.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bzrdir.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/decorators.py

bzrlib/delta.py

bzrlib/diff.py

bzrlib/doc/__init__.py

bzrlib/doc/api/__init__.py

bzrlib/doc/api/branch.txt

bzrlib/doc/api/transport.txt

bzrlib/errors.py

bzrlib/export/__init__.py

bzrlib/export/dir_exporter.py

bzrlib/export/tar_exporter.py

bzrlib/export/zip_exporter.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/identitymap.py

bzrlib/info.py

bzrlib/intset.py

bzrlib/inventory.py

bzrlib/iterablefile.py

bzrlib/lock.py

bzrlib/lockable_files.py

bzrlib/log.py

bzrlib/lsprof.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/option.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/progress.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/revisionspec.py

bzrlib/rio.py

bzrlib/status.py

bzrlib/store/__init__.py

bzrlib/store/text.py

bzrlib/symbol_versioning.py

bzrlib/testament.py

bzrlib/tests/HTTPTestUtil.py

bzrlib/tests/TestUtil.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_added.py

bzrlib/tests/blackbox/test_ancestry.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_find_merge_base.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/branch_implementations/test_parent.py

bzrlib/tests/branch_implementations/test_permissions.py

bzrlib/tests/bzrdir_implementations/__init__.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_fileid_involved.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_bad_files.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_command.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_commit_merge.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_decorators.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_doc_generate.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_hashcache.py

bzrlib/tests/test_http.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_log.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge3.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_missing.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_nonascii.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_permissions.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_revisionnamespaces.py

bzrlib/tests/test_rio.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_source.py

bzrlib/tests/test_store.py

bzrlib/tests/test_symbol_versioning.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_trace.py

bzrlib/tests/test_transactions.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_tsort.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_weave.py

bzrlib/tests/test_whitebox.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_xml.py

bzrlib/tests/treeshape.py

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/textinv.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/transactions.py

bzrlib/transport/__init__.py

bzrlib/transport/ftp.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/readonly.py

bzrlib/transport/sftp.py

bzrlib/tree.py

bzrlib/tsort.py

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/uncommit.py

bzrlib/upgrade.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/win32console.py

bzrlib/workingtree.py

bzrlib/xml4.py

bzrlib/xml5.py

bzrlib/xml_serializer.py

contrib/newinventory.py

contrib/pwk

generate_docs.py

setup.py

tools/convertfile.py

tools/convertinv.py

tools/doc_generate/__init__.py

tools/doc_generate/autodoc_bash_completion.py

tools/doc_generate/autodoc_man.py

tools/history2revfiles.py

tools/weavebench.py

Show diffs side-by-side

added added

removed removed

bzrlib/tuned_gzip.py

# Written by Robert Collins <robert.collins@canonical.com>

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""

from cStringIO import StringIO

# make GzipFile faster:

import gzip

from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC

import sys

import struct

import zlib

# we want a \n preserved, break on \n only splitlines.

import bzrlib

__all__ = ["GzipFile"]

class GzipFile(gzip.GzipFile):
    """Knit tuned version of GzipFile.

    Overrides a number of the stock ``gzip.GzipFile`` internals to cut down
    on function calls, ``len()`` calls and file seeks.

    This is based on the following lsprof stats:
    python 2.4 stock GzipFile write:
    58971      0   5644.3090   2721.4730   gzip:193(write)
    +58971     0   1159.5530   1159.5530   +<built-in method compress>
    +176913    0    987.0320    987.0320   +<len>
    +58971     0    423.1450    423.1450   +<zlib.crc32>
    +58971     0    353.1060    353.1060   +<method 'write' of 'cStringIO.
                                            StringO' objects>
    tuned GzipFile write:
    58971      0   4477.2590   2103.1120   bzrlib.knit:1250(write)
    +58971     0   1297.7620   1297.7620   +<built-in method compress>
    +58971     0    406.2160    406.2160   +<zlib.crc32>
    +58971     0    341.9020    341.9020   +<method 'write' of 'cStringIO.
                                            StringO' objects>
    +58971     0    328.2670    328.2670   +<len>

    Yes, it's only 1.6 seconds, but they add up.
    """

    def _add_read_data(self, data):
        """Account for a chunk of freshly decompressed data.

        Folds ``data`` into the running CRC, appends it to ``extrabuf`` and
        bumps ``extrasize`` and ``size`` by its length.

        :param data: decompressed data to append to the read buffer.
        """
        # 4169 calls in 183
        # temp var for len(data) and switch to +='s.
        # 4169 in 139
        len_data = len(data)
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf += data
        self.extrasize += len_data
        self.size += len_data

    def _write_gzip_header(self):
        """A tuned version of gzip._write_gzip_header

        We have some extra constraints that plain Gzip does not.
        1) We want to write the whole blob at once, rather than multiple
           calls to fileobj.write().
        2) We never have a filename
        3) We don't care about the time
        """
        # The adjacent literals are concatenated at compile time, so the
        # header goes out in a single write() call.  The trailing comments
        # show the stock gzip statement each literal replaces.
        self.fileobj.write(
            '\037\213'  # self.fileobj.write('\037\213')  # magic header
            '\010'      # self.fileobj.write('\010')      # compression method
                        # fname = self.filename[:-3]
                        # flags = 0
                        # if fname:
                        #     flags = FNAME
            '\x00'      # self.fileobj.write(chr(flags))
            '\0\0\0\0'  # write32u(self.fileobj, long(time.time()))
            '\002'      # self.fileobj.write('\002')
            '\377'      # self.fileobj.write('\377')
                        # if fname:
            ''          #     self.fileobj.write(fname + '\000')
            )

    def _read(self, size=1024):
        """Read and decompress one chunk of up to ``size`` compressed bytes.

        Decompressed data is handed to ``_add_read_data``.  When the end of
        a gzip member is reached the 8 byte trailer is stashed in
        ``self._gzip_tail`` and checked by ``_read_eof`` without seeking
        back over it.

        :param size: number of compressed bytes to read from ``fileobj``.
        :raises EOFError: when there is no more data to read.
        :raises AssertionError: when the stream ends without a complete
            8 byte trailer.
        """
        # various optimisations:
        # reduces lsprof count from 2500 to
        # 8337 calls in 1272, 365 internal
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            next_header_bytes = self.fileobj.read(10)
            if next_header_bytes == '':
                raise EOFError("Reached EOF")

            self._init_read()
            self._read_gzip_header(next_header_bytes)
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.
        if buf == "":
            self._add_read_data(self.decompress.flush())
            # Raised explicitly (rather than via ``assert``) so that the
            # check survives running under ``python -O``.
            if len(self.decompress.unused_data) < 8:
                raise AssertionError("what does flush do?")
            self._gzip_tail = self.decompress.unused_data[0:8]
            self._read_eof()
            # tell the driving read() call we have stuffed all the data
            # in self.extrabuf
            raise EOFError('Reached EOF')

        self._add_read_data(self.decompress.decompress(buf))

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the data for the next member which
            # is the length of the decompress objects unused data - the first
            # 8 bytes for the end crc and size records.
            #
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because those 8 bytes are part of this member.
            seek_length = len(self.decompress.unused_data) - 8
            if seek_length > 0:
                # we read too much data
                self.fileobj.seek(-seek_length, 1)
                self._gzip_tail = self.decompress.unused_data[0:8]
            elif seek_length < 0:
                # we haven't read enough to check the checksum.
                if not (-8 < seek_length):
                    raise AssertionError("too great a seek.")
                buf = self.fileobj.read(-seek_length)
                self._gzip_tail = self.decompress.unused_data + buf
            else:
                self._gzip_tail = self.decompress.unused_data

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _read_eof(self):
        """tuned to reduce function calls and eliminate file seeking:

        pass 1:
        reduces lsprof count from 800 to 288
        4168 in 296
        avoid U32 call by using struct format L
        4168 in 200

        Validates the trailer previously stored in ``self._gzip_tail``.

        :raises AssertionError: if the stored trailer is not 8 bytes long.
        :raises IOError: if the CRC or the length recorded in the trailer
            does not match the data that was decompressed.
        """
        # We've read to the end of the file, so we should have 8 bytes of
        # unused data in the decompressor. If we don't, there is a corrupt
        # file.  We use these 8 bytes to calculate the CRC and the recorded
        # file size.  We then check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        if len(self._gzip_tail) != 8:
            raise AssertionError("gzip trailer is incorrect length.")
        crc32, isize = struct.unpack("<LL", self._gzip_tail)
        # note that isize is unsigned - it can exceed 2GB
        computed_crc = U32(self.crc)
        if crc32 != computed_crc:
            raise IOError("CRC check failed %d %d" % (crc32, computed_crc))
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def _read_gzip_header(self, bytes=None):
        """Supply bytes if the minimum header size is already read.

        starting cost: 300 in 3998
        15998 reads from 3998 calls
        final cost 168

        :param bytes: 10 bytes of header data.  When None, the 10 bytes are
            read from ``self.fileobj``.
        :raises IOError: if the magic number or compression method is not
            the expected one.
        """
        if bytes is None:
            bytes = self.fileobj.read(10)
        magic = bytes[0:2]
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(bytes[2:3])
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(bytes[3:4])
        # modtime = self.fileobj.read(4) (bytes [4:8])
        # extraflag = self.fileobj.read(1) (bytes[8:9])
        # os = self.fileobj.read(1) (bytes[9:10])
        # self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the
            # filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def readline(self, size=-1):
        """Tuned to remove buffer length calls in _unread and...

        also removes multiple len(c) calls, inlines _unread,
        total savings - lsprof 5800 to 5300
        phase 2:
        4168 calls in 2233
        8176 calls to read() in 1684
        changing the min chunk size to 200 halved all the cache misses
        leading to a drop to:
        4168 calls in 1977
        4168 call to read() in 1646
        - i.e. just reduced the function call overhead. May be worth
          keeping.

        :param size: maximum number of bytes to return; a negative value
            means no limit.
        :return: the next line, including its trailing newline if one was
            found before ``size`` bytes or EOF.
        """
        if size < 0:
            size = sys.maxint
        bufs = []
        readsize = min(200, size)    # Read from the file in small chunks
        while True:
            if size == 0:
                return "".join(bufs)    # Return resulting line

            # c is the chunk
            c = self.read(readsize)
            # number of bytes read
            len_c = len(c)
            i = c.find('\n')
            # We set i=size-1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if i == -1 and len_c > size:
                i = size - 1
            elif size <= i:
                i = size - 1

            if i >= 0 or c == '':
                # if i >= 0 we have a newline or have triggered the size
                # limit above.
                # if c == '' it's EOF.
                bufs.append(c[:i+1])    # Add portion of last chunk
                # -- inlined self._unread --
                # Push back the rest of the chunk.  The pushed back data is
                # c[i+1:], which is len_c - i - 1 bytes long; using
                # len_c - i here would leave extrasize one too large and
                # offset one too small after every call.
                len_rest = len_c - i - 1
                self.extrabuf = c[i+1:] + self.extrabuf
                self.extrasize = len_rest + self.extrasize
                self.offset -= len_rest
                # -- end inlined self._unread --
                return ''.join(bufs)    # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len_c
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        """Return all remaining lines as a list.

        :param sizehint: accepted for interface compatibility; ignored,
            the whole remaining content is always read.
        """
        # optimise to avoid all the buffer manipulation
        # lsprof changed from:
        # 4168 calls in 5472 with 32000 calls to readline()
        # to :
        # 4168 calls in 417.
        #
        # python's gzip routine uses sizehint.  Honouring it would be
        # possible, but it is even more efficient to just read the entire
        # thing and use cStringIO to split into lines.
        content = StringIO(self.read(-1))
        return content.readlines()

    def _unread(self, buf, len_buf=None):
        """tuned to remove unneeded len calls.

        because this is such an inner routine in readline, and readline is
        in many inner loops, this has been inlined into readline().

        The len_buf parameter combined with the reduction in len calls dropped
        the lsprof ms count for this routine on my test data from 800 to 200 -
        a 75% saving.

        :param buf: data to push back onto the front of ``extrabuf``.
        :param len_buf: length of ``buf`` if the caller already knows it;
            computed with ``len(buf)`` when None.
        """
        if len_buf is None:
            len_buf = len(buf)
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len_buf + self.extrasize
        self.offset -= len_buf

    def write(self, data):
        """Compress ``data`` and write it to the underlying file object.

        Empty data is ignored.

        :param data: the data to compress.
        :raises IOError: (errno.EBADF) if the file was not opened for
            writing.
        :raises ValueError: if the file has been closed.
        """
        if self.mode != gzip.WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        data_len = len(data)
        if data_len > 0:
            self.size = self.size + data_len
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))
            self.offset += data_len

    def writelines(self, lines):
        """Write a sequence of strings as a single ``write()`` call.

        :param lines: iterable of strings; they are joined without any
            separator being added.
        """
        # profiling indicated a significant overhead
        # calling write for each line.
        # this batch call is a lot faster :).
        # (4 seconds to 1 seconds for the sample upgrades I was testing).
        self.write(''.join(lines))

331

332

Older »