~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/tuned_gzip.py

Committer: Vincent Ladeuil
Date: 2013-07-13 19:05:24 UTC
mto: This revision was merged to the branch mainline in revision 6580.
Revision ID: v.ladeuil+lp@free.fr-20130713190524-3bclzq4hpwkd6hkw

Urgh. pqm still runs python 2.6 so we have to maintain compatibility to land the fix 8-(

files modified:
bzrlib/tuned_gzip.py

Show diffs side-by-side

added added

removed removed

bzrlib/tuned_gzip.py

127

DeprecationWarning, stacklevel=2)

128

gzip.GzipFile.__init__(self, *args, **kwargs)

129

130

def _add_read_data(self, data):

131

# 4169 calls in 183

132

# temp var for len(data) and switch to +='s.

133

# 4169 in 139

134

len_data = len(data)

135

self.crc = zlib.crc32(data, self.crc) & 0xffffffffL

136

offset = self.offset - self.extrastart

137

self.extrabuf = self.extrabuf[offset:] + data

138

self.extrasize = self.extrasize + len_data

139

self.extrastart = self.offset

140

self.size = self.size + len_data

130

if sys.version_info >= (2, 7, 4):

131

def _add_read_data(self, data):

132

# 4169 calls in 183

133

# temp var for len(data) and switch to +='s.

134

# 4169 in 139

135

len_data = len(data)

136

self.crc = zlib.crc32(data, self.crc) & 0xffffffffL

137

offset = self.offset - self.extrastart

138

self.extrabuf = self.extrabuf[offset:] + data

139

self.extrasize = self.extrasize + len_data

140

self.extrastart = self.offset

141

self.size = self.size + len_data

142

else:

143

def _add_read_data(self, data):

144

# 4169 calls in 183

145

# temp var for len(data) and switch to +='s.

146

# 4169 in 139

147

len_data = len(data)

148

self.crc = zlib.crc32(data, self.crc)

149

self.extrabuf += data

150

self.extrasize += len_data

151

self.size += len_data

141

152

142

153

def _write_gzip_header(self):

143

154

"""A tuned version of gzip._write_gzip_header

163

174

'' # self.fileobj.write(fname + '\000')

164

175

)

165

176

177

if sys.version_info < (2, 7, 4):

178

def _read(self, size=1024):

179

# various optimisations:

180

# reduces lsprof count from 2500 to

181

# 8337 calls in 1272, 365 internal

182

if self.fileobj is None:

183

raise EOFError, "Reached EOF"

184

185

if self._new_member:

186

# If the _new_member flag is set, we have to

187

# jump to the next member, if there is one.

188

189

# First, check if we're at the end of the file;

190

# if so, it's time to stop; no more members to read.

191

next_header_bytes = self.fileobj.read(10)

192

if next_header_bytes == '':

193

raise EOFError, "Reached EOF"

194

195

self._init_read()

196

self._read_gzip_header(next_header_bytes)

197

self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)

198

self._new_member = False

199

200

# Read a chunk of data from the file

201

buf = self.fileobj.read(size)

202

203

# If the EOF has been reached, flush the decompression object

204

# and mark this object as finished.

205

206

if buf == "":

207

self._add_read_data(self.decompress.flush())

208

if len(self.decompress.unused_data) < 8:

209

raise AssertionError("what does flush do?")

210

self._gzip_tail = self.decompress.unused_data[0:8]

211

self._read_eof()

212

# tell the driving read() call we have stuffed all the data

213

# in self.extrabuf

214

raise EOFError, 'Reached EOF'

215

216

self._add_read_data(self.decompress.decompress(buf))

217

218

if self.decompress.unused_data != "":

219

# Ending case: we've come to the end of a member in the file,

220

# so seek back to the start of the data for the next member

221

# which is the length of the decompress object's unused data -

222

# the first 8 bytes for the end crc and size records.

223

224

# so seek back to the start of the unused data, finish up

225

# this member, and read a new gzip header.

226

# (The number of bytes to seek back is the length of the unused

227

# data, minus 8 because those 8 bytes are part of this member.)

228

seek_length = len (self.decompress.unused_data) - 8

229

if seek_length > 0:

230

# we read too much data

231

self.fileobj.seek(-seek_length, 1)

232

self._gzip_tail = self.decompress.unused_data[0:8]

233

elif seek_length < 0:

234

# we haven't read enough to check the checksum.

235

if not (-8 < seek_length):

236

raise AssertionError("too great a seek")

237

buf = self.fileobj.read(-seek_length)

238

self._gzip_tail = self.decompress.unused_data + buf

239

else:

240

self._gzip_tail = self.decompress.unused_data

241

242

# Check the CRC and file size, and set the flag so we read

243

# a new member on the next call

244

self._read_eof()

245

self._new_member = True

246

247

def _read_eof(self):

248

"""tuned to reduce function calls and eliminate file seeking:

249

pass 1:

250

reduces lsprof count from 800 to 288

251

4168 in 296

252

avoid U32 call by using struct format L

253

4168 in 200

254

"""

255

# We've read to the end of the file, so we should have 8 bytes of

256

# unused data in the decompressor. If we don't, there is a corrupt

257

# file. We use these 8 bytes to calculate the CRC and the recorded

258

# file size. We then check that the computed CRC and size of

259

# the uncompressed data matches the stored values. Note that the

260

# size stored is the true file size mod 2**32.

261

if not (len(self._gzip_tail) == 8):

262

raise AssertionError("gzip trailer is incorrect length.")

263

crc32, isize = struct.unpack("<LL", self._gzip_tail)

264

# note that isize is unsigned - it can exceed 2GB

265

if crc32 != U32(self.crc):

266

raise IOError, "CRC check failed %d %d" % (crc32, U32(self.crc))

267

elif isize != LOWU32(self.size):

268

raise IOError, "Incorrect length of data produced"

269

166

270

def _read_gzip_header(self, bytes=None):

167

271

"""Supply bytes if the minimum header size is already read.

168

272

Older »