1750.1.2
by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib. |
1 |
# Copyright (C) 2006 Michael Ellerman
|
1786.1.33
by John Arbash Meinel
Cleanup pass #2 |
2 |
# modified by John Arbash Meinel (Canonical Ltd)
|
1750.1.2
by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib. |
3 |
#
|
4 |
# This program is free software; you can redistribute it and/or modify
|
|
5 |
# it under the terms of the GNU General Public License as published by
|
|
6 |
# the Free Software Foundation; either version 2 of the License, or
|
|
7 |
# (at your option) any later version.
|
|
8 |
#
|
|
9 |
# This program is distributed in the hope that it will be useful,
|
|
10 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
11 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
12 |
# GNU General Public License for more details.
|
|
13 |
#
|
|
14 |
# You should have received a copy of the GNU General Public License
|
|
15 |
# along with this program; if not, write to the Free Software
|
|
16 |
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
17 |
||
18 |
"""Handlers for HTTP Responses.
|
|
19 |
||
20 |
The purpose of these classes is to provide a uniform interface for clients
|
|
21 |
to standard HTTP responses, single range responses and multipart range
|
|
22 |
responses.
|
|
23 |
"""
|
|
24 |
||
25 |
||
26 |
from bisect import bisect |
|
1786.1.27
by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration. |
27 |
from cStringIO import StringIO |
1750.1.2
by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib. |
28 |
import re |
29 |
||
1786.1.13
by John Arbash Meinel
Found a few bugs in error handling code, updated tests |
30 |
from bzrlib import errors |
1750.1.2
by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib. |
31 |
from bzrlib.trace import mutter |
32 |
||
33 |
||
34 |
class ResponseRange(object):
    """A range in a RangeFile-object.

    Maps an inclusive span of entity offsets (``_ent_start`` through
    ``_ent_end``) to ``_data_start``, the offset in the downloaded data at
    which the bytes of that span begin.

    Two ranges order by ``(_ent_start, _ent_end, _data_start)``.  A range can
    also be compared with a bare offset, in which case only ``_ent_start``
    takes part; that is what allows a sorted list of ranges to be bisected
    for a position.
    """

    __slots__ = ['_ent_start', '_ent_end', '_data_start']

    def __init__(self, ent_start, ent_end, data_start):
        self._ent_start = ent_start
        self._ent_end = ent_end
        self._data_start = data_start

    def _sort_key(self):
        """Return the tuple that defines ordering between two ranges."""
        return (self._ent_start, self._ent_end, self._data_start)

    def __cmp__(self, other):
        """Compare this to other.

        We need this both for sorting, and so that we can
        bisect the list of ranges.

        :param other: Another ResponseRange, or a plain offset.
        :return: -1, 0 or 1 when self sorts before, with, or after other.
        """
        if isinstance(other, ResponseRange):
            mine, theirs = self._sort_key(), other._sort_key()
        else:
            # Later on we bisect for a starting point, so we allow comparing
            # against a single offset.  Anything that is not a range is
            # treated as an offset, so that `long` positions work as well as
            # `int` ones.
            mine, theirs = self._ent_start, other
        # Spelled out rather than calling cmp(), which not every interpreter
        # provides.
        return (mine > theirs) - (mine < theirs)

    # Rich comparisons, for interpreters that never consult __cmp__.  All six
    # are spelled out because sort() needs __lt__ and bisect() compares an
    # offset against a range (which reaches the reflected __gt__).
    # NOTE: defining __eq__ makes instances unhashable on Python 3.
    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __str__(self):
        return "%s(%s-%s,%s)" % (self.__class__.__name__,
                                 self._ent_start, self._ent_end,
                                 self._data_start)

    __repr__ = __str__
64 |
||
1750.1.2
by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib. |
65 |
|
66 |
class RangeFile(object):
    """File-like object that allows access to partially available data.

    The available data is specified by a set of ranges.  The whole of
    ``input_file`` is read into memory at construction time; each range added
    afterwards says which entity offsets a slice of that data stands for.
    """

    def __init__(self, path, input_file):
        """Read all of input_file and start with no ranges.

        :param path: Identifies the file; only used when raising errors.
        :param input_file: An object whose read() returns the full data.
        """
        self._path = path
        self._pos = 0
        self._len = 0
        self._ranges = []
        self._data = input_file.read()

    def _add_range(self, ent_start, ent_end, data_start):
        """Add an entity range.

        :param ent_start: Start offset of entity
        :param ent_end: End offset of entity (inclusive)
        :param data_start: Start offset of data in data stream.
        """
        self._ranges.append(ResponseRange(ent_start, ent_end, data_start))
        # NOTE(review): ent_end is inclusive, so _len holds the highest known
        # offset rather than a byte count, and seek(offset, 2) is relative to
        # it.  Confirm whether ent_end + 1 was intended before changing it.
        self._len = max(self._len, ent_end)

    def _finish_ranges(self):
        """Sort the ranges; must be called before read() bisects them."""
        self._ranges.sort()

    def read(self, size=-1):
        """Read size bytes from the current position in the file.

        Reading across ranges is not supported.

        :param size: Number of bytes to read.  A negative value or None (the
            default is -1) reads up to the end of the range that contains
            the current position.
        :return: The requested slice of the data.
        :raises errors.InvalidRange: If the current position is not inside
            any range, or the read would run past the end of its range.
        """
        # find the last range which has a start <= pos
        i = bisect(self._ranges, self._pos) - 1

        if i < 0 or self._pos > self._ranges[i]._ent_end:
            mutter('Bisect for pos: %s failed. Found offset: %d, ranges:%s',
                   self._pos, i, self._ranges)
            raise errors.InvalidRange(self._path, self._pos)

        r = self._ranges[i]

        if size is None or size < 0:
            # Usual file-object meaning of "no size": everything that is
            # left, which here is the rest of this range (ent_end inclusive).
            # Without this a negative size would move the position backwards.
            size = r._ent_end - self._pos + 1

        if (self._pos + size - 1) > r._ent_end:
            raise errors.InvalidRange(self._path, self._pos)

        start = r._data_start + (self._pos - r._ent_start)
        end = start + size
        self._pos += size
        return self._data[start:end]

    def seek(self, offset, whence=0):
        """Move the current position.

        :param offset: Offset to apply.
        :param whence: 0 for absolute, 1 for relative to the current
            position, 2 for relative to the highest range end seen so far.
        :raises ValueError: If whence is not 0, 1 or 2.
        """
        if whence == 0:
            self._pos = offset
        elif whence == 1:
            self._pos += offset
        elif whence == 2:
            self._pos = self._len + offset
        else:
            raise ValueError("Invalid value %s for whence." % whence)

        # Seeking before the start clamps to the start instead of failing.
        if self._pos < 0:
            self._pos = 0

    def tell(self):
        """Return the current position."""
        return self._pos
|
134 |
||
1786.1.5
by John Arbash Meinel
Move the common Multipart stuff into plain http, and wrap pycurl response so that it matches the urllib response object. |
135 |
|
1750.1.2
by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib. |
136 |
class HttpRangeResponse(RangeFile):
    """A single-range HTTP response."""

    # TODO: jam 20060706 Consider compiling these regexes on demand
    # Groups: range unit, first byte, last byte, complete length.  The
    # complete length may be '*' when the server does not know (or does not
    # want to tell) the size of the whole entity.
    _CONTENT_RANGE_RE = re.compile(
        r'\s*([^\s]+)\s+([0-9]+)-([0-9]+)/([0-9]+|\*)\s*$')

    def __init__(self, path, content_range, input_file):
        """Wrap input_file as the single range named by content_range.

        :param path: Identifies the file; used for error reporting.
        :param content_range: The value of the Content-Range header.
        :param input_file: An object whose read() returns the range's data.
        """
        RangeFile.__init__(self, path, input_file)
        start, end = self._parse_range(content_range, path)
        # The body holds nothing but this range, so its data starts at 0.
        self._add_range(start, end, 0)
        self._finish_ranges()

    @staticmethod
    def _parse_range(range, path='<unknown>'):
        """Parse an http Content-range header and return start + end

        :param range: The value for Content-range
        :param path: Provide to give better error messages.
        :return: (start, end) A tuple of integers
        :raises errors.InvalidHttpRange: If the header does not have the
            expected form, uses a unit other than 'bytes', or names a start
            that lies after its end.
        """
        match = HttpRangeResponse._CONTENT_RANGE_RE.match(range)
        if not match:
            raise errors.InvalidHttpRange(path, range,
                                          "Invalid Content-range")

        # The complete length is matched only to validate the header.
        rtype, start, end, total = match.groups()

        if rtype != 'bytes':
            raise errors.InvalidHttpRange(path, range,
                    "Unsupported range type '%s'" % (rtype,))

        try:
            start = int(start)
            end = int(end)
        except ValueError:
            # The regex only lets digits through, so this is purely defensive.
            raise errors.InvalidHttpRange(path, range,
                                          "Invalid range values")

        if start > end:
            # An inverted range cannot describe any bytes.
            raise errors.InvalidHttpRange(path, range,
                                          "Invalid range values")

        return start, end
|
1786.1.16
by John Arbash Meinel
Refactor tests |
176 |
|
177 |
||
178 |
class HttpMultipartRangeResponse(RangeFile):
    """A multi-range HTTP response."""

    # Media type names are case-insensitive, hence IGNORECASE.  The boundary
    # itself is captured verbatim in group 2, with optional surrounding
    # double quotes (group 1) removed.
    _CONTENT_TYPE_RE = re.compile(
        r'^\s*multipart/byteranges\s*;\s*boundary\s*=\s*("?)([^"]*?)\1\s*$',
        re.IGNORECASE)

    # Start with --<boundary>\r\n
    # and ignore all headers ending in \r\n
    # except for content-range:
    # and find the two trailing \r\n separators
    # indicating the start of the text
    # TODO: jam 20060706 This requires exact conformance
    #       to the spec, we probably could relax the requirement
    #       of \r\n, and use something more like (\r?\n)
    _BOUNDARY_PATT = (
        "^--%s(?:\r\n(?:(?:content-range:([^\r]+))|[^\r]+))+\r\n\r\n")

    def __init__(self, path, content_type, input_file):
        """Locate every section of a multipart/byteranges body.

        :param path: Identifies the file; used for error reporting.
        :param content_type: The Content-Type header, carrying the boundary.
        :param input_file: An object whose read() returns the whole body.
        :raises errors.InvalidHttpContentType: If content_type is not a
            multipart/byteranges type with a boundary.
        :raises errors.InvalidHttpResponse: If a section has no
            Content-Range header.
        :raises errors.InvalidHttpRange: If a section's Content-Range cannot
            be parsed.
        """
        RangeFile.__init__(self, path, input_file)

        self.boundary_regex = self._parse_boundary(content_type, path)

        for match in self.boundary_regex.finditer(self._data):
            content_range = match.group(1)
            if content_range is None:
                # The section had headers, but none of them was
                # Content-Range, so there is no telling which bytes it holds.
                raise errors.InvalidHttpResponse(path,
                    'Missing the Content-Range header in a multipart section')
            ent_start, ent_end = HttpRangeResponse._parse_range(content_range,
                                                                path)
            # The section's data begins right after its blank line.
            self._add_range(ent_start, ent_end, match.end())

        self._finish_ranges()

    @staticmethod
    def _parse_boundary(ctype, path='<unknown>'):
        """Parse the Content-type field.

        This expects a multipart Content-type, and returns a
        regex which is capable of finding the boundaries
        in the multipart data.

        :param ctype: The value of the Content-Type header.
        :param path: Provide to give better error messages.
        :return: A compiled regex; group 1 of each match is the value of the
            section's content-range header.
        :raises errors.InvalidHttpContentType: If ctype is not a
            multipart/byteranges type with a boundary.
        """
        match = HttpMultipartRangeResponse._CONTENT_TYPE_RE.match(ctype)
        if not match:
            raise errors.InvalidHttpContentType(path, ctype,
                    "Expected multipart/byteranges with boundary")

        boundary = match.group(2)
        pattern = HttpMultipartRangeResponse._BOUNDARY_PATT
        # The boundary is arbitrary text, so escape it before it becomes part
        # of a regex.
        return re.compile(pattern % re.escape(boundary),
                          re.IGNORECASE | re.MULTILINE)
|
1786.1.21
by John Arbash Meinel
(broken) Work on factoring out handle_response so we can test with fake headers. |
229 |
|
230 |
||
1786.1.25
by John Arbash Meinel
Test that we can extract headers properly. |
231 |
def _is_multipart(content_type): |
232 |
return content_type.startswith('multipart/byteranges;') |
|
233 |
||
234 |
||
1786.1.27
by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration. |
235 |
def handle_response(url, code, headers, data):
    """Build the response object that matches an HTTP status and headers.

    This is a factory: depending on the code & headers it either hands back
    a file-like object or raises.

    :param url: The url being processed. Mostly for error reporting
    :param code: The integer HTTP response code
    :param headers: A dict-like object that contains the HTTP response headers
    :param data: A file-like object that can be read() to get the
                 requested data
    :return: A file-like object that can seek()+read() the
             ranges indicated by the headers.
    """
    if code == 200:
        # A regular non-range response, unfortunately the result from
        # urllib doesn't support seek, so we wrap it in a StringIO
        if getattr(data, 'tell', None) is None:
            return StringIO(data.read())
        return data

    if code == 404:
        raise errors.NoSuchFile(url)

    if code != 206:
        # TODO: jam 20060713 Properly handle redirects (302 Found, etc)
        #       The '_get' code says to follow redirects, we probably
        #       should actually handle the return values
        raise errors.InvalidHttpResponse(url, "Unknown response code %s"
                                              % (code,))

    # From here on this is a 206 response to a range request.
    try:
        ctype = headers['Content-Type']
    except KeyError:
        raise errors.InvalidHttpContentType(url, '',
                                            msg='Missing Content-Type')

    if _is_multipart(ctype):
        # Full fledged multipart response
        return HttpMultipartRangeResponse(url, ctype, data)

    # A response to a range request, but not multipart
    try:
        crange = headers['Content-Range']
    except KeyError:
        raise errors.InvalidHttpResponse(url,
            'Missing the Content-Range header in a 206 range response')
    return HttpRangeResponse(url, crange, data)
|
1786.1.21
by John Arbash Meinel
(broken) Work on factoring out handle_response so we can test with fake headers. |
284 |