~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/tuned_gzip.py

Committer: abentley
Date: 2006-04-20 23:47:53 UTC
mfrom: (1681 +trunk)
mto: This revision was merged to the branch mainline in revision 1683.
Revision ID: abentley@lappy-20060420234753-6a6874b76f09f86d

Merge bzr.dev

files added:
bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/branch_implementations/test_pull.py

bzrlib/tests/test_escaped_store.py

bzrlib/tests/test_patch.py

bzrlib/tests/test_textfile.py

bzrlib/tests/test_textmerge.py

bzrlib/tests/test_tuned_gzip.py

bzrlib/textfile.py

bzrlib/textmerge.py

bzrlib/transport/decorator.py

bzrlib/transport/fakenfs.py

bzrlib/transport/fakevfat.py

bzrlib/transport/http

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_urllib.py

bzrlib/tuned_gzip.py

bzrlib/weave_commands.py

tools/doc_generate/autodoc_rstx.py

files renamed:
bzrlib/tests/test_uncommit.py => bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/test_basis_inventory.py => bzrlib/tests/workingtree_implementations/test_basis_inventory.py

bzrlib/transport/http.py => bzrlib/transport/http/__init__.py

files modified:
BRANCH.TODO

HACKING

NEWS

README

bzrlib/__init__.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bzrdir.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/delta.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/fetch.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/knit.py

bzrlib/lockable_files.py

bzrlib/lockdir.py

bzrlib/log.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/msgeditor.py

bzrlib/option.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/progress.py

bzrlib/reconcile.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/rio.py

bzrlib/status.py

bzrlib/store/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/text.py

bzrlib/store/versioned/__init__.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_bound_sftp.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/branch_implementations/test_parent.py

bzrlib/tests/branch_implementations/test_permissions.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/repository_implementations/test_fileid_involved.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_doc_generate.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_hashcache.py

bzrlib/tests/test_http.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge3.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_missing.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_rio.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_store.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_tsort.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/test_weave.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_xml.py

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/trace.py

bzrlib/transform.py

bzrlib/transport/__init__.py

bzrlib/transport/ftp.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/readonly.py

bzrlib/transport/sftp.py

bzrlib/tree.py

bzrlib/tsort.py

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/uncommit.py

bzrlib/upgrade.py

bzrlib/versionedfile.py

bzrlib/weave.py

bzrlib/weavefile.py

bzrlib/workingtree.py

bzrlib/xml5.py

setup.py

tools/doc_generate/autodoc_man.py

tutorial.txt

Show diffs side-by-side

added added

removed removed

bzrlib/tuned_gzip.py

# Written by Robert Collins <robert.collins@canonical.com>

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""

# make GzipFile faster:

import gzip

from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC

import sys

import struct

import zlib

# We want every "\n" preserved and lines broken on "\n" only (unlike
# str.splitlines), so bzrlib is imported for its own line splitter.

import bzrlib

__all__ = ["GzipFile"]

class GzipFile(gzip.GzipFile):
    """Knit tuned version of GzipFile.

    The methods below override the stock ``gzip.GzipFile`` read and write
    internals to cut down on function calls, ``len()`` calls and seeks.

    This is based on the following lsprof stats:
    python 2.4 stock GzipFile write:
     58971      0   5644.3090   2721.4730   gzip:193(write)
     +58971     0   1159.5530   1159.5530   +<built-in method compress>
     +176913    0    987.0320    987.0320   +<len>
     +58971     0    423.1450    423.1450   +<zlib.crc32>
     +58971     0    353.1060    353.1060   +<method 'write' of 'cStringIO.
                                            StringO' objects>
    tuned GzipFile write:
     58971      0   4477.2590   2103.1120   bzrlib.knit:1250(write)
     +58971     0   1297.7620   1297.7620   +<built-in method compress>
     +58971     0    406.2160    406.2160   +<zlib.crc32>
     +58971     0    341.9020    341.9020   +<method 'write' of 'cStringIO.
                                            StringO' objects>
     +58971     0    328.2670    328.2670   +<len>

    Yes, it's only 1.6 seconds, but they add up.
    """

    def _add_read_data(self, data):
        """Fold freshly decompressed ``data`` into the read-side state.

        Updates the running CRC, appends ``data`` to ``extrabuf`` and grows
        both ``extrasize`` and ``size`` by ``len(data)``.
        """
        # lsprof: 4169 calls in 183.
        # Using a temp var for len(data) and switching to +='s gave
        # 4169 calls in 139.
        len_data = len(data)
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf += data
        self.extrasize += len_data
        self.size += len_data

    def _read(self, size=1024):
        """Read and decompress up to ``size`` compressed bytes.

        Decompressed data is accumulated through ``_add_read_data``.  When
        the end of a gzip member is reached its 8 byte trailer is stored in
        ``self._gzip_tail`` and verified by ``_read_eof``.

        :param size: number of compressed bytes to request from ``fileobj``.
        :raises EOFError: when ``fileobj`` is None, when there is no further
            member header to read, or once the final data has been flushed.
        :raises AssertionError: when fewer than 8 trailer bytes are left
            after flushing the decompressor at end of file.
        """
        # various optimisations:
        # reduces lsprof count from 2500 to
        # 8337 calls in 1272, 365 internal
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            next_header_bytes = self.fileobj.read(10)
            if next_header_bytes == '':
                raise EOFError("Reached EOF")

            self._init_read()
            self._read_gzip_header(next_header_bytes)
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.
        if buf == "":
            self._add_read_data(self.decompress.flush())
            # Raised explicitly (not via ``assert``) so the check is not
            # stripped when running under ``python -O``.
            if len(self.decompress.unused_data) < 8:
                raise AssertionError("what does flush do?")
            self._gzip_tail = self.decompress.unused_data[0:8]
            self._read_eof()
            # tell the driving read() call we have stuffed all the data
            # in self.extrabuf
            raise EOFError('Reached EOF')

        self._add_read_data(self.decompress.decompress(buf))

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the data for the next member,
            # finish up this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because those 8 bytes - the end crc and size
            # records - are part of this member.)
            seek_length = len(self.decompress.unused_data) - 8
            if seek_length > 0:
                # we read too much data
                self.fileobj.seek(-seek_length, 1)
                self._gzip_tail = self.decompress.unused_data[0:8]
            elif seek_length < 0:
                # we haven't read enough to check the checksum.
                if not -8 < seek_length:
                    raise AssertionError("too great a seek.")
                buf = self.fileobj.read(-seek_length)
                self._gzip_tail = self.decompress.unused_data + buf
            else:
                self._gzip_tail = self.decompress.unused_data

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _read_eof(self):
        """Verify the 8 byte gzip trailer held in ``self._gzip_tail``.

        Tuned to reduce function calls and eliminate file seeking:
        pass 1:
        reduces lsprof count from 800 to 288
        4168 in 296
        avoid U32 call by using struct format L
        4168 in 200

        :raises AssertionError: if the trailer is not exactly 8 bytes long.
        :raises IOError: if the stored CRC or the stored size does not match
            the values computed while reading.
        """
        # We've read to the end of the file, so we should have 8 bytes of
        # unused data in the decompressor. If we don't, there is a corrupt
        # file.  We use these 8 bytes to calculate the CRC and the recorded
        # file size.  We then check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        #
        # Raised explicitly (not via ``assert``) so a bad trailer is still
        # reported when running under ``python -O``.
        if len(self._gzip_tail) != 8:
            raise AssertionError("gzip trailer is incorrect length.")
        crc32, isize = struct.unpack("<LL", self._gzip_tail)
        # note that isize is unsigned - it can exceed 2GB
        if crc32 != U32(self.crc):
            raise IOError("CRC check failed %d %d" % (crc32, U32(self.crc)))
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def _read_gzip_header(self, bytes=None):
        """Supply bytes if the minimum header size is already read.

        Optional header fields flagged in the fourth header byte (extra
        field, file name, comment, header CRC) are read from ``fileobj``
        and discarded.

        :param bytes: 10 bytes of header data.  When None they are read
            from ``self.fileobj``.
        :raises IOError: if the magic number is wrong or the compression
            method is not 8.
        """
        # starting cost: 300 in 3998
        # 15998 reads from 3998 calls
        # final cost 168
        if bytes is None:
            bytes = self.fileobj.read(10)
        magic = bytes[0:2]
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(bytes[2:3])
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(bytes[3:4])
        # modtime = self.fileobj.read(4) (bytes [4:8])
        # extraflag = self.fileobj.read(1) (bytes[8:9])
        # os = self.fileobj.read(1) (bytes[9:10])
        # self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256 * ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the
            # filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def readline(self, size=-1):
        """Return the next line, reading at most ``size`` bytes.

        Tuned to remove buffer length calls in _unread and to remove
        multiple len(c) calls; _unread is inlined.

        total savings - lsprof 5800 to 5300
        phase 2:
        4168 calls in 2233
        8176 calls to read() in 1684
        changing the min chunk size to 200 halved all the cache misses
        leading to a drop to:
        4168 calls in 1977
        4168 call to read() in 1646
        - i.e. just reduced the function call overhead. May be worth
          keeping.

        :param size: maximum number of bytes to return; a negative value
            means no limit.
        :return: the line including its trailing newline if one was found
            within ``size`` bytes; '' at end of file.
        """
        if size < 0:
            size = sys.maxint
        bufs = []
        readsize = min(200, size)    # Read from the file in small chunks
        while True:
            if size == 0:
                return "".join(bufs)    # Return resulting line

            # c is the chunk
            c = self.read(readsize)
            # number of bytes read
            len_c = len(c)
            i = c.find('\n')
            # We set i=size-1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if i == -1 and len_c > size:
                i = size - 1
            elif size <= i:
                i = size - 1

            if i >= 0 or c == '':
                # if i >= 0 we have a newline or have triggered the size
                # limit above.
                # if c == '' it's EOF.
                bufs.append(c[:i + 1])    # Add portion of last chunk
                # -- inlined self._unread --
                # The pushed back tail is c[i+1:], i.e. len_c - i - 1 bytes.
                # Accounting for len_c - i bytes here would leave extrasize
                # one too large and offset one too small after every call.
                pushed_back = len_c - i - 1
                self.extrabuf = c[i + 1:] + self.extrabuf
                self.extrasize = pushed_back + self.extrasize
                self.offset -= pushed_back
                # -- end inlined self._unread --
                return ''.join(bufs)    # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len_c
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        """Return the remaining content as a list of lines.

        The content is fetched with a single ``read`` call and split by
        ``bzrlib.osutils.split_lines``.

        :param sizehint: passed to ``read`` as the byte count; zero or a
            negative number results in reading all the lines.
        """
        # optimise to avoid all the buffer manipulation
        # lsprof changed from:
        # 4168 calls in 5472 with 32000 calls to readline()
        # to :
        # 4168 calls in 417.
        if sizehint <= 0:
            sizehint = -1
        content = self.read(sizehint)
        return bzrlib.osutils.split_lines(content)

    def _unread(self, buf, len_buf=None):
        """Push ``buf`` back onto the front of the read buffer.

        Tuned to remove unneeded len calls.

        Because this is such an inner routine in readline, and readline is
        in many inner loops, this has been inlined into readline().

        The len_buf parameter combined with the reduction in len calls
        dropped the lsprof ms count for this routine on my test data from
        800 to 200 - a 75% saving.

        :param buf: the data to push back.
        :param len_buf: length of ``buf`` if the caller already knows it;
            computed with ``len(buf)`` when None.
        """
        if len_buf is None:
            len_buf = len(buf)
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len_buf + self.extrasize
        self.offset -= len_buf

    def write(self, data):
        """Compress ``data`` and write it to the underlying file object.

        Empty ``data`` is ignored after the mode and closed checks.

        :raises IOError: (errno.EBADF) if the file is not open for writing.
        :raises ValueError: if the file has been closed.
        """
        if self.mode != gzip.WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        data_len = len(data)
        if data_len > 0:
            self.size = self.size + data_len
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))
            self.offset += data_len

    def writelines(self, lines):
        """Write all of ``lines`` with a single ``write`` call."""
        # profiling indicated a significant overhead
        # calling write for each line.
        # this batch call is a lot faster :).
        # (4 seconds to 1 seconds for the sample upgrades I was testing).
        self.write(''.join(lines))

299

300

Older »