24
from __future__ import absolute_import
26
from bisect import bisect
27
28
from cStringIO import StringIO
30
from bzrlib import errors
31
from bzrlib.trace import mutter
34
class ResponseRange(object):
35
"""A range in a RangeFile-object."""
37
__slots__ = ['_ent_start', '_ent_end', '_data_start']
39
def __init__(self, ent_start, ent_end, data_start):
    """Record one entity range and where its bytes sit in the data stream.

    :param ent_start: Start offset of the range within the entity.
    :param ent_end: End offset of the range within the entity (inclusive).
    :param data_start: Start offset of the range's data in the data stream.
    """
    self._ent_start = ent_start
    self._ent_end = ent_end
    self._data_start = data_start
44
def __cmp__(self, other):
45
"""Compare this to other.
47
We need this both for sorting, and so that we can
48
bisect the list of ranges.
37
class ResponseFile(object):
38
"""A wrapper around the http socket containing the result of a GET request.
40
Only read() and seek() (forward) are supported.
43
def __init__(self, path, infile):
46
:param path: File url, for error reports.
48
:param infile: File-like socket set at body start.
50
if isinstance(other, int):
51
# Later on we bisect for a starting point
52
# so we allow comparing against a single integer
53
return cmp(self._ent_start, other)
55
return cmp((self._ent_start, self._ent_end, self._data_start),
56
(other._ent_start, other._ent_end, other._data_start))
59
return "%s(%s-%s,%s)" % (self.__class__.__name__,
60
self._ent_start, self._ent_end,
66
class RangeFile(object):
67
"""File-like object that allow access to partial available data.
69
Specified by a set of ranges.
72
def __init__(self, path, input_file):
77
self._data = input_file.read()
79
def _add_range(self, ent_start, ent_end, data_start):
80
"""Add an entity range.
82
:param ent_start: Start offset of entity
83
:param ent_end: End offset of entity (inclusive)
84
:param data_start: Start offset of data in data stream.
86
self._ranges.append(ResponseRange(ent_start, ent_end, data_start))
87
self._len = max(self._len, ent_end)
89
def _finish_ranges(self):
93
"""Read size bytes from the current position in the file.
95
Reading across ranges is not supported.
97
# find the last range which has a start <= pos
98
i = bisect(self._ranges, self._pos) - 1
100
if i < 0 or self._pos > self._ranges[i]._ent_end:
101
mutter('Bisect for pos: %s failed. Found offset: %d, ranges:%s',
102
self._pos, i, self._ranges)
103
raise errors.InvalidRange(self._path, self._pos)
107
# mutter('found range %s %s for pos %s', i, self._ranges[i], self._pos)
109
if (self._pos + size - 1) > r._ent_end:
110
raise errors.InvalidRange(self._path, self._pos)
112
start = r._data_start + (self._pos - r._ent_start)
114
# mutter("range read %d bytes at %d == %d-%d", size, self._pos,
116
self._pos += (end-start)
117
return self._data[start:end]
57
Dummy implementation for consistency with the 'file' API.
60
def read(self, size=-1):
61
"""Read size bytes from the current position in the file.
63
:param size: The number of bytes to read. Leave unspecified or pass
66
data = self._file.read(size)
67
self._pos += len(data)
71
data = self._file.readline()
72
self._pos += len(data)
77
line = self.readline()
85
def seek(self, offset, whence=os.SEEK_SET):
86
if whence == os.SEEK_SET:
87
if offset < self._pos:
89
"Can't seek backwards, pos: %s, offset: %s"
90
% (self._pos, offset))
91
to_discard = offset - self._pos
92
elif whence == os.SEEK_CUR:
95
raise AssertionError("Can't seek backwards")
97
# Just discard the unwanted bytes
100
# A RangeFile expects the following grammar (simplified to outline the
101
# assumptions we rely upon).
106
# single_range: content_range_header data
108
# multiple_range: boundary_header boundary (content_range_header data boundary)+
110
class RangeFile(ResponseFile):
111
"""File-like object that allow access to partial available data.
113
All accesses should happen sequentially since the acquisition occurs during
114
an http response reception (as sockets can't be seeked, we simulate the
115
seek by just reading and discarding the data).
117
The access pattern is defined by a set of ranges discovered as reading
118
progress. Only one range is available at a given time, so all accesses
119
should happen with monotonically increasing offsets.
122
# in _checked_read() below, we may have to discard several MB in the worst
123
# case. To avoid buffering that much, we read and discard by chunks
124
# instead. The underlying file is either a socket or a StringIO, so reading
125
# 8k chunks should be fine.
126
_discarded_buf_size = 8192
128
# maximum size of read requests -- used to avoid MemoryError issues in recv
129
_max_read_size = 512 * 1024
131
def __init__(self, path, infile):
134
:param path: File url, for error reports.
136
:param infile: File-like socket set at body start.
138
super(RangeFile, self).__init__(path, infile)
139
self._boundary = None
140
# When using multi parts response, this will be set with the headers
141
# associated with the range currently read.
143
# Default to the whole file of unspecified size
144
self.set_range(0, -1)
146
def set_range(self, start, size):
147
"""Change the range mapping"""
150
# Set the new _pos since that's what we want to expose
151
self._pos = self._start
153
def set_boundary(self, boundary):
154
"""Define the boundary used in a multi parts message.
156
The file should be at the beginning of the body, the first range
157
definition is read and taken into account.
159
self._boundary = boundary
160
# Decode the headers and setup the first range
162
self.read_range_definition()
164
def read_boundary(self):
165
"""Read the boundary headers defining a new range"""
166
boundary_line = '\r\n'
167
while boundary_line == '\r\n':
168
# RFC2616 19.2 Additional CRLFs may precede the first boundary
170
# To be on the safe side we allow it before any boundary line
171
boundary_line = self._file.readline()
173
if boundary_line == '':
174
# A timeout in the proxy server caused the response to end early.
175
# See launchpad bug 198646.
176
raise errors.HttpBoundaryMissing(
180
if boundary_line != '--' + self._boundary + '\r\n':
181
# rfc822.unquote() incorrectly unquotes strings enclosed in <>
182
# IIS 6 and 7 incorrectly wrap boundary strings in <>
183
# together they make a beautiful bug, which we will be gracious
185
if (self._unquote_boundary(boundary_line) !=
186
'--' + self._boundary + '\r\n'):
187
raise errors.InvalidHttpResponse(
189
"Expected a boundary (%s) line, got '%s'"
190
% (self._boundary, boundary_line))
192
def _unquote_boundary(self, b):
    """Return boundary line ``b`` with its middle section unquoted.

    The first two and the last two characters of ``b`` are kept verbatim;
    only ``b[2:-2]`` is passed through ``rfc822.unquote()``.  The caller
    compares the result against '--' + boundary + CRLF, so those outer
    slices are expected to be the leading dashes and the line terminator.

    :param b: A boundary line as read from the response body.
    :return: The rebuilt line, with any quoting around the middle part
        removed by ``rfc822.unquote()``.
    """
    prefix, quoted, suffix = b[:2], b[2:-2], b[-2:]
    return prefix + rfc822.unquote(quoted) + suffix
195
def read_range_definition(self):
196
"""Read a new range definition in a multi parts message.
198
Parse the headers including the empty line following them so that we
199
are ready to read the data itself.
201
self._headers = httplib.HTTPMessage(self._file, seekable=0)
202
# Extract the range definition
203
content_range = self._headers.getheader('content-range', None)
204
if content_range is None:
205
raise errors.InvalidHttpResponse(
207
'Content-Range header missing in a multi-part response')
208
self.set_range_from_header(content_range)
210
def set_range_from_header(self, content_range):
211
"""Helper to set the new range from its description in the headers"""
213
rtype, values = content_range.split()
215
raise errors.InvalidHttpRange(self._path, content_range,
218
raise errors.InvalidHttpRange(self._path, content_range,
219
"Unsupported range type '%s'" % rtype)
221
# We don't need total, but note that it may be either the file size
222
# or '*' if the server can't or doesn't want to return the file
224
start_end, total = values.split('/')
225
start, end = start_end.split('-')
229
raise errors.InvalidHttpRange(self._path, content_range,
230
'Invalid range values')
231
size = end - start + 1
233
raise errors.InvalidHttpRange(self._path, content_range,
234
'Invalid range, size <= 0')
235
self.set_range(start, size)
237
def _checked_read(self, size):
238
"""Read the file checking for short reads.
240
The data read is discarded along the way.
245
data = self._file.read(min(remaining, self._discarded_buf_size))
246
remaining -= len(data)
248
raise errors.ShortReadvError(self._path, pos, size,
252
def _seek_to_next_range(self):
253
# We will cross range boundaries
254
if self._boundary is None:
255
# If we don't have a boundary, we can't find another range
256
raise errors.InvalidRange(self._path, self._pos,
257
"Range (%s, %s) exhausted"
258
% (self._start, self._size))
260
self.read_range_definition()
262
def read(self, size=-1):
263
"""Read size bytes from the current position in the file.
265
Reading across ranges is not supported. We rely on the underlying http
266
client to clean the socket if we leave bytes unread. This may occur for
267
the final boundary line of a multipart response or for any range
268
request not entirely consumed by the client (due to offset coalescing)
270
:param size: The number of bytes to read. Leave unspecified or pass
274
and self._pos == self._start + self._size):
278
self._seek_to_next_range()
279
elif self._pos < self._start:
280
raise errors.InvalidRange(
281
self._path, self._pos,
282
"Can't read %s bytes before range (%s, %s)"
283
% (size, self._start, self._size))
285
if size > 0 and self._pos + size > self._start + self._size:
286
raise errors.InvalidRange(
287
self._path, self._pos,
288
"Can't read %s bytes across range (%s, %s)"
289
% (size, self._start, self._size))
291
# read data from file
295
# Don't read past the range definition
296
limited = self._start + self._size - self._pos
298
limited = min(limited, size)
299
osutils.pumpfile(self._file, buf, limited, self._max_read_size)
300
data = buf.getvalue()
302
# Update _pos respecting the data effectively read
303
self._pos += len(data)
119
306
def seek(self, offset, whence=0):
307
start_pos = self._pos
122
310
elif whence == 1:
311
final_pos = start_pos + offset
124
312
elif whence == 2:
125
self._pos = self._len + offset
314
final_pos = self._start + self._size + offset # offset < 0
316
raise errors.InvalidRange(
317
self._path, self._pos,
318
"RangeFile: can't seek from end while size is unknown")
127
320
raise ValueError("Invalid value %s for whence." % whence)
322
if final_pos < self._pos:
323
# Can't seek backwards
324
raise errors.InvalidRange(
325
self._path, self._pos,
326
'RangeFile: trying to seek backwards to %s' % final_pos)
329
cur_limit = self._start + self._size
330
while final_pos > cur_limit:
331
# We will cross range boundaries
332
remain = cur_limit - self._pos
334
# Finish reading the current range
335
self._checked_read(remain)
336
self._seek_to_next_range()
337
cur_limit = self._start + self._size
339
size = final_pos - self._pos
340
if size > 0: # size can be < 0 if we crossed a range boundary
341
# We don't need the data, just read it and throw it away
342
self._checked_read(size)
136
class HttpRangeResponse(RangeFile):
137
"""A single-range HTTP response."""
139
# TODO: jam 20060706 Consider compiling these regexes on demand
140
_CONTENT_RANGE_RE = re.compile(
141
r'\s*([^\s]+)\s+([0-9]+)-([0-9]+)/([0-9]+)\s*$')
143
def __init__(self, path, content_range, input_file):
    """Build a response that exposes the single range in ``content_range``.

    :param path: Handed to ``RangeFile.__init__`` and to ``_parse_range``,
        which uses it in error messages.
    :param content_range: The Content-range header value; parsed by
        ``_parse_range`` into a (start, end) pair.
    :param input_file: Handed to ``RangeFile.__init__``.
    """
    # mutter("parsing 206 non-multipart response for %s", path)
    RangeFile.__init__(self, path, input_file)
    start, end = self._parse_range(content_range, path)
    # Only one range is registered, with its data at offset 0 of the
    # data stream.
    self._add_range(start, end, 0)
    self._finish_ranges()
151
def _parse_range(range, path='<unknown>'):
152
"""Parse an http Content-range header and return start + end
154
:param range: The value for Content-range
155
:param path: Provide to give better error messages.
156
:return: (start, end) A tuple of integers
158
match = HttpRangeResponse._CONTENT_RANGE_RE.match(range)
160
raise errors.InvalidHttpRange(path, range,
161
"Invalid Content-range")
163
rtype, start, end, total = match.groups()
166
raise errors.InvalidHttpRange(path, range,
167
"Unsupported range type '%s'" % (rtype,))
172
except ValueError, e:
173
raise errors.InvalidHttpRange(path, range, str(e))
178
class HttpMultipartRangeResponse(RangeFile):
179
"""A multi-range HTTP response."""
181
_CONTENT_TYPE_RE = re.compile(
182
r'^\s*multipart/byteranges\s*;\s*boundary\s*=\s*("?)([^"]*?)\1\s*$')
184
# Start with --<boundary>\r\n
185
# and ignore all headers ending in \r\n
186
# except for content-range:
187
# and find the two trailing \r\n separators
188
# indicating the start of the text
189
# TODO: jam 20060706 This requires exact conformance
190
# to the spec, we probably could relax the requirement
191
# of \r\n, and use something more like (\r?\n)
193
"^--%s(?:\r\n(?:(?:content-range:([^\r]+))|[^\r]+))+\r\n\r\n")
195
def __init__(self, path, content_type, input_file):
196
# mutter("parsing 206 multipart response for %s", path)
197
# TODO: jam 20060706 Is it valid to initialize a
198
# grandparent without initializing parent?
199
RangeFile.__init__(self, path, input_file)
201
self.boundary_regex = self._parse_boundary(content_type, path)
202
# mutter('response:\n%r', self._data)
204
for match in self.boundary_regex.finditer(self._data):
205
ent_start, ent_end = HttpRangeResponse._parse_range(match.group(1),
207
self._add_range(ent_start, ent_end, match.end())
209
self._finish_ranges()
212
def _parse_boundary(ctype, path='<unknown>'):
213
"""Parse the Content-type field.
215
This expects a multipart Content-type, and returns a
216
regex which is capable of finding the boundaries
217
in the multipart data.
219
match = HttpMultipartRangeResponse._CONTENT_TYPE_RE.match(ctype)
221
raise errors.InvalidHttpContentType(path, ctype,
222
"Expected multipart/byteranges with boundary")
224
boundary = match.group(2)
225
# mutter('multipart boundary is %s', boundary)
226
pattern = HttpMultipartRangeResponse._BOUNDARY_PATT
227
return re.compile(pattern % re.escape(boundary),
228
re.IGNORECASE | re.MULTILINE)
231
def _is_multipart(content_type):
    """Tell whether a Content-Type value announces multipart/byteranges.

    The value must name the ``multipart/byteranges`` media type and be
    followed by a ';' introducing parameters.  Whitespace before the media
    type and between it and the ';' is tolerated, so that this predicate
    accepts the same headers as the ``_CONTENT_TYPE_RE`` pattern used to
    extract the boundary; a bare ``startswith('multipart/byteranges;')``
    would reject headers that pattern is able to parse.

    :param content_type: The Content-Type header value (a string).
    :return: True if the value is a parameterised multipart/byteranges
        content type, False otherwise.
    """
    media_type, separator, _params = content_type.partition(';')
    # No ';' at all means no parameters, hence no boundary to look for.
    return separator == ';' and media_type.strip() == 'multipart/byteranges'
235
def handle_response(url, code, headers, data):
236
"""Interpret the code & headers and return a HTTP response.
238
This is a factory method which returns an appropriate HTTP response
239
based on the code & headers it's given.
348
def handle_response(url, code, msg, data):
349
"""Interpret the code & headers and wrap the provided data in a RangeFile.
351
This is a factory method which returns an appropriate RangeFile based on
352
the code & headers it's given.
241
354
:param url: The url being processed. Mostly for error reporting
242
355
:param code: The integer HTTP response code
243
:param headers: A dict-like object that contains the HTTP response headers
356
:param msg: An HTTPMessage containing the headers for the response
244
357
:param data: A file-like object that can be read() to get the
246
:return: A file-like object that can seek()+read() the
359
:return: A file-like object that can seek()+read() the
247
360
ranges indicated by the headers.
252
content_type = headers['Content-Type']
254
# When there is no content-type header we treat
255
# the response as being of type 'application/octet-stream' as per
256
# RFC2616 section 7.2.1.
364
rfile = ResponseFile(url, data)
366
rfile = RangeFile(url, data)
367
content_type = msg.getheader('content-type', None)
368
if content_type is None:
369
# When there is no content-type header we treat the response as
370
# being of type 'application/octet-stream' as per RFC2616 section
257
372
# Therefore it is obviously not multipart
258
373
content_type = 'application/octet-stream'
259
374
is_multipart = False
261
is_multipart = _is_multipart(content_type)
376
is_multipart = (msg.getmaintype() == 'multipart'
377
and msg.getsubtype() == 'byteranges')
264
380
# Full fledged multipart response
265
return HttpMultipartRangeResponse(url, content_type, data)
381
rfile.set_boundary(msg.getparam('boundary'))
267
383
# A response to a range request, but not multipart
269
content_range = headers['Content-Range']
384
content_range = msg.getheader('content-range', None)
385
if content_range is None:
271
386
raise errors.InvalidHttpResponse(url,
272
387
'Missing the Content-Range header in a 206 range response')
273
return HttpRangeResponse(url, content_range, data)
275
# A regular non-range response, unfortunately the result from
276
# urllib doesn't support seek, so we wrap it in a StringIO
277
tell = getattr(data, 'tell', None)
279
return StringIO(data.read())
282
raise errors.NoSuchFile(url)
284
# TODO: jam 20060713 Properly handle redirects (302 Found, etc)
285
# The '_get' code says to follow redirects, we probably
286
# should actually handle the return values
388
rfile.set_range_from_header(content_range)
288
raise errors.InvalidHttpResponse(url, "Unknown response code %s"
390
raise errors.InvalidHttpResponse(url,
391
'Unknown response code %s' % code)