~bzr-pqm/bzr/bzr.dev

1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
1
# Copyright (C) 2006 Michael Ellerman
1786.1.33 by John Arbash Meinel
Cleanup pass #2
2
#           modified by John Arbash Meinel (Canonical Ltd)
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
3
#
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation; either version 2 of the License, or
7
# (at your option) any later version.
8
#
9
# This program is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with this program; if not, write to the Free Software
16
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
18
"""Handlers for HTTP Responses.
19
20
The purpose of these classes is to provide a uniform interface for clients
21
to standard HTTP responses, single range responses and multipart range
22
responses.
23
"""
24
25
26
from bisect import bisect
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
27
from cStringIO import StringIO
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
28
import re
29
1786.1.13 by John Arbash Meinel
Found a few bugs in error handling code, updated tests
30
from bzrlib import errors
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
31
from bzrlib.trace import mutter
32
33
34
class ResponseRange(object):
    """A range in a RangeFile-object.

    Ties a span of entity (file) offsets to the offset in the downloaded
    data where the bytes for that span begin.

    Ranges order by (ent_start, ent_end, data_start).  A range can also be
    compared against a bare offset, in which case only ent_start takes
    part; that is what lets a sorted list of ranges be bisected with a
    position.
    """

    __slots__ = ['_ent_start', '_ent_end', '_data_start']

    def __init__(self, ent_start, ent_end, data_start):
        """Create a range.

        :param ent_start: Start offset of the entity span.
        :param ent_end: End offset of the entity span.
        :param data_start: Offset in the data where the span's bytes start.
        """
        self._ent_start = ent_start
        self._ent_end = ent_end
        self._data_start = data_start

    def __cmp__(self, other):
        """Compare this to other.

        We need this both for sorting, and so that we can
        bisect the list of ranges.

        :param other: Another ResponseRange, or a plain offset.
        :return: -1, 0 or 1.
        """
        if isinstance(other, ResponseRange):
            mine = (self._ent_start, self._ent_end, self._data_start)
            theirs = (other._ent_start, other._ent_end, other._data_start)
        else:
            # Later on we bisect for a starting point, so we allow
            # comparing against a single offset.  Anything that is not a
            # range is treated as one: testing for ``int`` alone would turn
            # away the ``long`` offsets Python 2 produces past sys.maxint.
            mine = self._ent_start
            theirs = other
        # Three-way result written out so this does not rely on the cmp()
        # builtin being available.
        return (mine > theirs) - (mine < theirs)

    def __str__(self):
        """Return 'ClassName(ent_start-ent_end,data_start)'."""
        return "%s(%s-%s,%s)" % (self.__class__.__name__,
                                 self._ent_start, self._ent_end,
                                 self._data_start)

    __repr__ = __str__
64
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
65
66
class RangeFile(object):
    """File-like object that allow access to partial available data.

    Specified by a set of ranges.

    The whole of input_file is read into memory when the object is
    created.  Each range registered with _add_range() records which entity
    (file) offsets are present and where their bytes start in that buffer;
    read(), seek() and tell() all work in entity offsets.
    """

    def __init__(self, path, input_file):
        """Create a RangeFile with no ranges registered yet.

        :param path: Identifies the file; used when reporting errors.
        :param input_file: File-like object; it is read() in full here.
        """
        self._path = path
        self._pos = 0
        # Largest (inclusive) end offset handed to _add_range() so far.
        self._len = 0
        self._ranges = []
        self._data = input_file.read()

    def _add_range(self, ent_start, ent_end, data_start):
        """Add an entity range.

        :param ent_start: Start offset of entity
        :param ent_end: End offset of entity (inclusive)
        :param data_start: Start offset of data in data stream.
        """
        self._ranges.append(ResponseRange(ent_start, ent_end, data_start))
        self._len = max(self._len, ent_end)

    def _finish_ranges(self):
        """Sort the ranges; read() bisects them, so call this after adding."""
        self._ranges.sort()

    def read(self, size=-1):
        """Read size bytes from the current position in the file.

        Reading across ranges is not supported.

        :param size: Number of bytes to read.  A negative value or None
            reads up to the end of the range holding the current position.
        :return: The bytes sliced out of the downloaded data.
        :raises InvalidRange: If the current position is not inside a
            known range, or more bytes are asked for than that range has
            left.
        """
        # find the last range which has a start <= pos
        i = bisect(self._ranges, self._pos) - 1

        if i < 0 or self._pos > self._ranges[i]._ent_end:
            mutter('Bisect for pos: %s failed. Found offset: %d, ranges:%s',
                   self._pos, i, self._ranges)
            raise errors.InvalidRange(self._path, self._pos)

        r = self._ranges[i]

        # _ent_end is inclusive, hence the + 1.
        remaining = r._ent_end - self._pos + 1
        if size is None or size < 0:
            # Without this a negative size would move the position
            # backwards and slice the wrong part of the data.
            size = remaining
        elif size > remaining:
            raise errors.InvalidRange(self._path, self._pos)

        start = r._data_start + (self._pos - r._ent_start)
        self._pos += size
        return self._data[start:start + size]

    def seek(self, offset, whence=0):
        """Move the current position.

        :param offset: Offset to apply, interpreted according to whence.
        :param whence: 0 for absolute, 1 for relative to the current
            position, 2 for relative to the end.
        :raises ValueError: If whence is not 0, 1 or 2.
        """
        if whence == 0:
            self._pos = offset
        elif whence == 1:
            self._pos += offset
        elif whence == 2:
            # NOTE(review): _len is the largest *inclusive* end offset, so
            # this is relative to the last known byte rather than to one
            # past it -- confirm that is what callers expect.
            self._pos = self._len + offset
        else:
            raise ValueError("Invalid value %s for whence." % whence)

        # Never leave the position before the start of the file.
        if self._pos < 0:
            self._pos = 0

    def tell(self):
        """Return the current position as an entity offset."""
        return self._pos
134
1786.1.5 by John Arbash Meinel
Move the common Multipart stuff into plain http, and wrap pycurl response so that it matches the urllib response object.
135
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
136
class HttpRangeResponse(RangeFile):
    """A single-range HTTP response."""

    # TODO: jam 20060706 Consider compiling these regexes on demand
    # Groups: range unit, first byte, last byte, instance length.  The
    # instance length may be '*' when the server does not know it.
    _CONTENT_RANGE_RE = re.compile(
        r'\s*([^\s]+)\s+([0-9]+)-([0-9]+)/([0-9]+|\*)\s*$')

    def __init__(self, path, content_range, input_file):
        """Wrap the body of a non-multipart range response.

        :param path: Identifies the file; used when reporting errors.
        :param content_range: The value of the Content-Range header.
        :param input_file: File-like object holding the response body.
        :raises InvalidHttpRange: If content_range cannot be parsed.
        """
        # mutter("parsing 206 non-multipart response for %s", path)
        RangeFile.__init__(self, path, input_file)
        start, end = self._parse_range(content_range, path)
        # The body is the range itself, so its data starts at offset 0.
        self._add_range(start, end, 0)
        self._finish_ranges()

    @staticmethod
    def _parse_range(range, path='<unknown>'):
        """Parse an http Content-range header and return start + end

        :param range: The value for Content-range
        :param path: Provide to give better error messages.
        :return: (start, end) A tuple of integers
        :raises InvalidHttpRange: If the value does not look like
            '<unit> <start>-<end>/<total>' or the unit is not 'bytes'.
        """
        match = HttpRangeResponse._CONTENT_RANGE_RE.match(range)
        if not match:
            raise errors.InvalidHttpRange(path, range,
                                          "Invalid Content-range")

        # The instance length (group 4) is matched but not needed here.
        rtype, start, end = match.group(1, 2, 3)

        if rtype != 'bytes':
            raise errors.InvalidHttpRange(path, range,
                    "Unsupported range type '%s'" % (rtype,))

        # The pattern only lets runs of ASCII digits through, so int()
        # cannot fail on these.
        return int(start), int(end)
1786.1.16 by John Arbash Meinel
Refactor tests
176
177
178
class HttpMultipartRangeResponse(RangeFile):
    """A multi-range HTTP response."""

    # Group 2 is the boundary, with or without surrounding double quotes.
    # Media type and parameter names are matched case-insensitively.
    _CONTENT_TYPE_RE = re.compile(
        r'^\s*multipart/byteranges\s*;\s*boundary\s*=\s*("?)([^"]*?)\1\s*$',
        re.IGNORECASE)

    # Start with --<boundary>\r\n
    # and ignore all headers ending in \r\n
    # except for content-range:
    # and find the two trailing \r\n separators
    # indicating the start of the text
    # TODO: jam 20060706 This requires exact conformance
    #       to the spec, we probably could relax the requirement
    #       of \r\n, and use something more like (\r?\n)
    _BOUNDARY_PATT = (
        "^--%s(?:\r\n(?:(?:content-range:([^\r]+))|[^\r]+))+\r\n\r\n")

    def __init__(self, path, content_type, input_file):
        """Wrap the body of a multipart/byteranges response.

        :param path: Identifies the file; used when reporting errors.
        :param content_type: The value of the Content-Type header; the
            part boundary is taken from it.
        :param input_file: File-like object holding the response body.
        :raises InvalidHttpContentType: If content_type is not a
            multipart/byteranges type with a boundary.
        :raises InvalidHttpResponse: If a part has no Content-Range header.
        :raises InvalidHttpRange: If a part's Content-Range is malformed.
        """
        # mutter("parsing 206 multipart response for %s", path)
        RangeFile.__init__(self, path, input_file)

        self.boundary_regex = self._parse_boundary(content_type, path)
        # mutter('response:\n%r', self._data)

        for match in self.boundary_regex.finditer(self._data):
            content_range = match.group(1)
            if content_range is None:
                # The part had headers but none of them was Content-Range;
                # report it rather than passing None on to the parser.
                raise errors.InvalidHttpResponse(path,
                    'Missing the Content-Range header in a multipart'
                    ' range part')
            ent_start, ent_end = HttpRangeResponse._parse_range(
                content_range, path)
            # The part's data begins right after the blank line that ends
            # its headers, which is where the match stops.
            self._add_range(ent_start, ent_end, match.end())

        self._finish_ranges()

    @staticmethod
    def _parse_boundary(ctype, path='<unknown>'):
        """Parse the Content-type field.

        This expects a multipart Content-type, and returns a
        regex which is capable of finding the boundaries
        in the multipart data.

        :param ctype: The value of the Content-Type header.
        :param path: Provide to give better error messages.
        :return: A compiled regex; group 1 of each match is the value of
            the part's content-range header.
        :raises InvalidHttpContentType: If ctype is not a
            multipart/byteranges type with a boundary.
        """
        match = HttpMultipartRangeResponse._CONTENT_TYPE_RE.match(ctype)
        if not match:
            raise errors.InvalidHttpContentType(path, ctype,
                    "Expected multipart/byteranges with boundary")

        boundary = match.group(2)
        # mutter('multipart boundary is %s', boundary)
        pattern = HttpMultipartRangeResponse._BOUNDARY_PATT
        # The boundary is arbitrary text, so escape it before it becomes
        # part of the pattern.
        return re.compile(pattern % re.escape(boundary),
                          re.IGNORECASE | re.MULTILINE)
1786.1.21 by John Arbash Meinel
(broken) Work on factoring out handle_response so we can test with fake headers.
229
230
1786.1.25 by John Arbash Meinel
Test that we can extract headers properly.
231
def _is_multipart(content_type):
232
    return content_type.startswith('multipart/byteranges;')
233
234
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
235
def handle_response(url, code, headers, data):
    """Interpret the code & headers and return a HTTP response.

    This is a factory method which returns an appropriate HTTP response
    based on the code & headers it's given.

    :param url: The url being processed. Mostly for error reporting
    :param code: The integer HTTP response code
    :param headers: A dict-like object that contains the HTTP response headers
    :param data: A file-like object that can be read() to get the
                 requested data
    :return: A file-like object that can seek()+read() the
             ranges indicated by the headers.
    :raises NoSuchFile: For a 404 response.
    :raises InvalidHttpContentType: For a 206 response without a
        Content-Type header.
    :raises InvalidHttpResponse: For a non-multipart 206 response without
        a Content-Range header, and for any other response code.
    """
    if code == 404:
        raise errors.NoSuchFile(url)

    if code == 200:
        # A regular non-range response, unfortunately the result from
        # urllib doesn't support seek, so we wrap it in a StringIO
        if getattr(data, 'tell', None) is None:
            return StringIO(data.read())
        return data

    # TODO: jam 20060713 Properly handle redirects (302 Found, etc)
    #       The '_get' code says to follow redirects, we probably
    #       should actually handle the return values
    if code != 206:
        raise errors.InvalidHttpResponse(url, "Unknown response code %s"
                                              % (code,))

    # Only 206 (partial content) is left from here on.
    try:
        content_type = headers['Content-Type']
    except KeyError:
        raise errors.InvalidHttpContentType(url, '',
            msg='Missing Content-Type')

    if _is_multipart(content_type):
        # Full fledged multipart response
        return HttpMultipartRangeResponse(url, content_type, data)

    # A response to a range request, but not multipart
    try:
        content_range = headers['Content-Range']
    except KeyError:
        raise errors.InvalidHttpResponse(url,
            'Missing the Content-Range header in a 206 range response')
    return HttpRangeResponse(url, content_range, data)
1786.1.21 by John Arbash Meinel
(broken) Work on factoring out handle_response so we can test with fake headers.
284