1
# Copyright (C) 2006-2011 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Handlers for HTTP Responses.

The purpose of these classes is to provide a uniform interface for clients
to standard HTTP responses, single range responses and multipart range
responses.
"""
24
from __future__ import absolute_import
28
from cStringIO import StringIO
37
class ResponseFile(object):
    """A wrapper around the http socket containing the result of a GET request.

    Only read() and seek() (forward) are supported.

    """

    def __init__(self, path, infile):
        """Constructor.

        :param path: File url, for error reports.

        :param infile: File-like socket set at body start.
        """
        self._path = path
        self._file = infile
        # Number of bytes consumed from infile so far; this is what tell()
        # reports and what seek() measures against.
        self._pos = 0

    def close(self):
        """Close this file.

        Dummy implementation for consistency with the 'file' API.
        """

    def read(self, size=-1):
        """Read size bytes from the current position in the file.

        :param size: The number of bytes to read. Leave unspecified or pass
            -1 to read to EOF.
        :return: The bytes read from the underlying file.
        """
        data = self._file.read(size)
        self._pos += len(data)
        return data

    def readline(self):
        """Read one line from the underlying file, tracking the position."""
        data = self._file.readline()
        self._pos += len(data)
        return data

    def __iter__(self):
        """Yield the remaining lines until an empty read signals EOF."""
        while True:
            line = self.readline()
            if not line:
                return
            yield line

    def tell(self):
        """Return the number of bytes consumed so far."""
        return self._pos

    def seek(self, offset, whence=os.SEEK_SET):
        """Move forward in the stream by reading and discarding data.

        :param offset: Absolute position (os.SEEK_SET) or number of bytes to
            skip (os.SEEK_CUR).
        :param whence: os.SEEK_SET or os.SEEK_CUR; nothing else is supported.
        :raises AssertionError: If the seek would go backwards or whence is
            not supported.
        """
        if whence == os.SEEK_SET:
            if offset < self._pos:
                raise AssertionError(
                    "Can't seek backwards, pos: %s, offset: %s"
                    % (self._pos, offset))
            to_discard = offset - self._pos
        elif whence == os.SEEK_CUR:
            if offset < 0:
                # A negative size handed to read() means "read to EOF", so a
                # backward relative seek must be rejected explicitly instead
                # of silently consuming the rest of the response.
                raise AssertionError(
                    "Can't seek backwards, pos: %s, offset: %s"
                    % (self._pos, self._pos + offset))
            to_discard = offset
        else:
            raise AssertionError("Unsupported whence value: %s" % (whence,))
        if to_discard:
            # Just discard the unwanted bytes
            self.read(to_discard)
100
# A RangeFile expects the following grammar (simplified to outline the
# assumptions we rely upon).

# file: single_range
#     | multiple_range

# single_range: content_range_header data

# multiple_range: boundary_header boundary (content_range_header data boundary)+
110
class RangeFile(ResponseFile):
    """File-like object that allow access to partial available data.

    All accesses should happen sequentially since the acquisition occurs during
    an http response reception (as sockets can't be seeked, we simulate the
    seek by just reading and discarding the data).

    The access pattern is defined by a set of ranges discovered as reading
    progress. Only one range is available at a given time, so all accesses
    should happen with monotonically increasing offsets.
    """

    # in _checked_read() below, we may have to discard several MB in the worst
    # case. To avoid buffering that much, we read and discard by chunks
    # instead. The underlying file is either a socket or a StringIO, so reading
    # 8k chunks should be fine.
    _discarded_buf_size = 8192

    # maximum size of read requests -- used to avoid MemoryError issues in recv
    _max_read_size = 512 * 1024

    def __init__(self, path, infile):
        """Constructor.

        :param path: File url, for error reports.

        :param infile: File-like socket set at body start.
        """
        super(RangeFile, self).__init__(path, infile)
        self._boundary = None
        # When using multi parts response, this will be set with the headers
        # associated with the range currently read.
        self._headers = None
        # Default to the whole file of unspecified size
        self.set_range(0, -1)

    def set_range(self, start, size):
        """Change the range mapping

        :param start: Offset of the first byte of the range.
        :param size: Number of bytes in the range; a value that is not
            strictly positive is treated by read() and seek() as "size
            unknown".
        """
        self._start = start
        self._size = size
        # Set the new _pos since that's what we want to expose
        self._pos = self._start

    def set_boundary(self, boundary):
        """Define the boundary used in a multi parts message.

        The file should be at the beginning of the body, the first range
        definition is read and taken into account.

        :param boundary: The boundary string, without the leading '--'.
        """
        self._boundary = boundary
        # Decode the headers and setup the first range
        self.read_boundary()
        self.read_range_definition()

    def read_boundary(self):
        """Read the boundary headers defining a new range

        :raises errors.HttpBoundaryMissing: If the stream ends before a
            boundary line is found.
        :raises errors.InvalidHttpResponse: If the line read is not the
            expected boundary.
        """
        boundary_line = '\r\n'
        while boundary_line == '\r\n':
            # RFC2616 19.2 Additional CRLFs may precede the first boundary
            # string entity.
            # To be on the safe side we allow it before any boundary line
            boundary_line = self._file.readline()

        if boundary_line == '':
            # A timeout in the proxy server caused the response to end early.
            # See launchpad bug 198646.
            raise errors.HttpBoundaryMissing(
                self._path,
                self._boundary)

        expected = '--' + self._boundary + '\r\n'
        # rfc822.unquote() incorrectly unquotes strings enclosed in <>
        # IIS 6 and 7 incorrectly wrap boundary strings in <>
        # together they make a beautiful bug, which we will be gracious
        # about here
        if (boundary_line != expected
            and self._unquote_boundary(boundary_line) != expected):
            raise errors.InvalidHttpResponse(
                self._path,
                "Expected a boundary (%s) line, got '%s'"
                % (self._boundary, boundary_line))

    def _unquote_boundary(self, b):
        """Unquote the boundary part of a boundary line.

        The first two characters (the '--' prefix on a well-formed line) and
        the last two (the CRLF) are kept untouched; only what lies between is
        passed through rfc822.unquote().
        """
        return b[:2] + rfc822.unquote(b[2:-2]) + b[-2:]

    def read_range_definition(self):
        """Read a new range definition in a multi parts message.

        Parse the headers including the empty line following them so that we
        are ready to read the data itself.

        :raises errors.InvalidHttpResponse: If the part carries no
            Content-Range header.
        """
        self._headers = httplib.HTTPMessage(self._file, seekable=0)
        # Extract the range definition
        content_range = self._headers.getheader('content-range', None)
        if content_range is None:
            raise errors.InvalidHttpResponse(
                self._path,
                'Content-Range header missing in a multi-part response')
        self.set_range_from_header(content_range)

    def set_range_from_header(self, content_range):
        """Helper to set the new range from its description in the headers

        :param content_range: Value of a Content-Range header, of the form
            'bytes <start>-<end>/<total>'.
        :raises errors.InvalidHttpRange: If the value cannot be parsed, uses
            a unit other than 'bytes' or describes an empty range.
        """
        try:
            rtype, values = content_range.split()
        except ValueError:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Malformed header')
        if rtype != 'bytes':
            raise errors.InvalidHttpRange(self._path, content_range,
                                          "Unsupported range type '%s'" % rtype)
        try:
            # We don't need total, but note that it may be either the file size
            # or '*' if the server can't or doesn't want to return the file
            # size.
            start_end, total = values.split('/')
            start, end = start_end.split('-')
            start = int(start)
            end = int(end)
        except ValueError:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Invalid range values')
        # Both ends of the range are inclusive
        size = end - start + 1
        if size <= 0:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Invalid range, size <= 0')
        self.set_range(start, size)

    def _checked_read(self, size):
        """Read the file checking for short reads.

        The data read is discarded along the way.

        :param size: Number of bytes to consume.
        :raises errors.ShortReadvError: If the underlying file runs out of
            data before size bytes have been consumed.
        """
        pos = self._pos
        remaining = size
        while remaining > 0:
            data = self._file.read(min(remaining, self._discarded_buf_size))
            remaining -= len(data)
            if not data:
                raise errors.ShortReadvError(self._path, pos, size,
                                             size - remaining)
        self._pos += size

    def _seek_to_next_range(self):
        """Consume the next boundary and range headers of a multipart body.

        :raises errors.InvalidRange: If no boundary has been set, i.e. there
            is no further range to move to.
        """
        # We will cross range boundaries
        if self._boundary is None:
            # If we don't have a boundary, we can't find another range
            raise errors.InvalidRange(self._path, self._pos,
                                      "Range (%s, %s) exhausted"
                                      % (self._start, self._size))
        self.read_boundary()
        self.read_range_definition()

    def read(self, size=-1):
        """Read size bytes from the current position in the file.

        Reading across ranges is not supported. We rely on the underlying http
        client to clean the socket if we leave bytes unread. This may occur for
        the final boundary line of a multipart response or for any range
        request not entirely consumed by the client (due to offset coalescing)

        :param size: The number of bytes to read. Leave unspecified or pass
            -1 to read to EOF.
        :raises errors.InvalidRange: If the read would start before or extend
            past the current range.
        """
        if (self._size > 0
            and self._pos == self._start + self._size):
            if size == 0:
                return ''
            else:
                # The current range is exhausted, move on to the next one
                self._seek_to_next_range()
        elif self._pos < self._start:
            raise errors.InvalidRange(
                self._path, self._pos,
                "Can't read %s bytes before range (%s, %s)"
                % (size, self._start, self._size))
        if self._size > 0:
            if size > 0 and self._pos + size > self._start + self._size:
                raise errors.InvalidRange(
                    self._path, self._pos,
                    "Can't read %s bytes across range (%s, %s)"
                    % (size, self._start, self._size))

        # read data from file
        buf = StringIO()
        limited = size
        if self._size > 0:
            # Don't read past the range definition
            limited = self._start + self._size - self._pos
            if size >= 0:
                limited = min(limited, size)
        osutils.pumpfile(self._file, buf, limited, self._max_read_size)
        data = buf.getvalue()

        # Update _pos respecting the data effectively read
        self._pos += len(data)
        return data

    def seek(self, offset, whence=os.SEEK_SET):
        """Move forward to a new position, discarding the data in between.

        :param offset: Position, interpreted according to whence.
        :param whence: os.SEEK_SET (absolute), os.SEEK_CUR (relative to the
            current position) or os.SEEK_END (relative to the end of the
            current range, which requires the range size to be known).
        :raises errors.InvalidRange: If the seek would go backwards, or is
            relative to the end while the size is unknown.
        :raises ValueError: If whence is not one of the values above.
        """
        start_pos = self._pos
        if whence == os.SEEK_SET:
            final_pos = offset
        elif whence == os.SEEK_CUR:
            final_pos = start_pos + offset
        elif whence == os.SEEK_END:
            if self._size > 0:
                final_pos = self._start + self._size + offset # offset < 0
            else:
                raise errors.InvalidRange(
                    self._path, self._pos,
                    "RangeFile: can't seek from end while size is unknown")
        else:
            raise ValueError("Invalid value %s for whence." % whence)

        if final_pos < self._pos:
            # Can't seek backwards
            raise errors.InvalidRange(
                self._path, self._pos,
                'RangeFile: trying to seek backwards to %s' % final_pos)

        if self._size > 0:
            cur_limit = self._start + self._size
            while final_pos > cur_limit:
                # We will cross range boundaries
                remain = cur_limit - self._pos
                if remain > 0:
                    # Finish reading the current range
                    self._checked_read(remain)
                self._seek_to_next_range()
                cur_limit = self._start + self._size

        size = final_pos - self._pos
        if size > 0: # size can be < 0 if we crossed a range boundary
            # We don't need the data, just read it and throw it away
            self._checked_read(size)

    def tell(self):
        """Return the current position."""
        return self._pos
348
def handle_response(url, code, msg, data):
    """Interpret the code & headers and wrap the provided data in a RangeFile.

    This is a factory method which returns an appropriate RangeFile based on
    the code & headers it's given.

    :param url: The url being processed. Mostly for error reporting
    :param code: The integer HTTP response code
    :param msg: An HTTPMessage containing the headers for the response
    :param data: A file-like object that can be read() to get the
                 requested data
    :return: A file-like object that can seek()+read() the
             ranges indicated by the headers.
    :raises errors.InvalidHttpResponse: If the code is neither 200 nor 206,
        or if a 206 response lacks the headers needed to locate its ranges.
    """
    if code == 200:
        # A whole file
        rfile = ResponseFile(url, data)
    elif code == 206:
        rfile = RangeFile(url, data)
        if msg.getheader('content-type', None) is None:
            # When there is no content-type header we treat the response as
            # being of type 'application/octet-stream' as per RFC2616 section
            # 7.2.1.
            # Therefore it is obviously not multipart
            is_multipart = False
        else:
            is_multipart = (msg.getmaintype() == 'multipart'
                            and msg.getsubtype() == 'byteranges')

        if is_multipart:
            # Full fledged multipart response
            boundary = msg.getparam('boundary')
            if boundary is None:
                # Without a boundary the parts cannot be delimited; report it
                # as a bad response rather than letting the later string
                # concatenation on None fail with a TypeError.
                raise errors.InvalidHttpResponse(url,
                    'Missing the boundary parameter in a multipart response')
            rfile.set_boundary(boundary)
        else:
            # A response to a range request, but not multipart
            content_range = msg.getheader('content-range', None)
            if content_range is None:
                raise errors.InvalidHttpResponse(url,
                    'Missing the Content-Range header in a 206 range response')
            rfile.set_range_from_header(content_range)
    else:
        raise errors.InvalidHttpResponse(url,
                                         'Unknown response code %s' % code)

    return rfile