~bzr-pqm/bzr/bzr.dev

2052.3.1 by John Arbash Meinel
Add tests to cleanup the copyright of all source files
1
# Copyright (C) 2006 Canonical Ltd
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""Handlers for HTTP Responses.
18
19
The purpose of these classes is to provide a uniform interface for clients
20
to standard HTTP responses, single range responses and multipart range
21
responses.
22
"""
23
24
25
from bisect import bisect
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
26
from cStringIO import StringIO
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
27
import re
28
1786.1.13 by John Arbash Meinel
Found a few bugs in error handling code, updated tests
29
from bzrlib import errors
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
30
from bzrlib.trace import mutter
31
32
33
class ResponseRange(object):
    """A range in a RangeFile-object.

    Records that the entity bytes ``_ent_start`` through ``_ent_end``
    (inclusive) begin at offset ``_data_start`` of the buffered response
    body.

    Ranges order by ``(_ent_start, _ent_end, _data_start)``.  A range can
    also be compared with a bare offset, in which case only ``_ent_start``
    takes part; this is what allows a sorted list of ranges to be bisected
    for a position.
    """

    __slots__ = ['_ent_start', '_ent_end', '_data_start']

    def __init__(self, ent_start, ent_end, data_start):
        """Create a range.

        :param ent_start: Offset of the first byte within the entity.
        :param ent_end: Offset of the last byte within the entity.
        :param data_start: Offset of the first byte within the data stream.
        """
        self._ent_start = ent_start
        self._ent_end = ent_end
        self._data_start = data_start

    def __cmp__(self, other):
        """Compare this to other.

        We need this both for sorting, and so that we can
        bisect the list of ranges.

        :param other: Another ResponseRange, or an offset to compare
            against the start of this range.
        :return: -1, 0 or 1.
        """
        if isinstance(other, ResponseRange):
            mine = (self._ent_start, self._ent_end, self._data_start)
            theirs = (other._ent_start, other._ent_end, other._data_start)
        else:
            # Later on we bisect for a starting point, so anything that is
            # not a range is treated as an offset.  Testing for the range
            # type (rather than for int) keeps 'long' offsets working.
            mine = self._ent_start
            theirs = other
        # Spelled out instead of calling cmp() so that the rich comparison
        # methods below do not depend on that builtin being available.
        return (mine > theirs) - (mine < theirs)

    # Rich comparisons all delegate to __cmp__, so there is a single
    # definition of the ordering whichever protocol the interpreter uses.
    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __str__(self):
        """Return 'ClassName(ent_start-ent_end,data_start)'."""
        return "%s(%s-%s,%s)" % (self.__class__.__name__,
                                 self._ent_start, self._ent_end,
                                 self._data_start)

    __repr__ = __str__
63
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
64
65
class RangeFile(object):
    """File-like object that allow access to partial available data.

    Specified by a set of ranges.

    The whole of the input is read into memory when the object is created;
    each registered range then says which entity offsets a slice of that
    buffer corresponds to.
    """

    def __init__(self, path, input_file):
        """Buffer input_file and start with no ranges.

        :param path: Name reported in errors raised by read().
        :param input_file: Object whose read() returns the response body.
        """
        self._path = path
        self._pos = 0
        self._len = 0
        self._ranges = []
        self._data = input_file.read()

    def _add_range(self, ent_start, ent_end, data_start):
        """Add an entity range.

        :param ent_start: Start offset of entity
        :param ent_end: End offset of entity (inclusive)
        :param data_start: Start offset of data in data stream.
        """
        self._ranges.append(ResponseRange(ent_start, ent_end, data_start))
        # NOTE(review): this stores the largest *inclusive* end offset, so
        # seek(offset, 2) is relative to the last known byte rather than to
        # one past it.  Left as is; confirm what callers expect.
        self._len = max(self._len, ent_end)

    def _finish_ranges(self):
        """Sort the ranges; read() bisects them, so call this after adding."""
        self._ranges.sort()

    def read(self, size=-1):
        """Read size bytes from the current position in the file.

        Reading across ranges is not supported.

        :param size: Number of bytes to read.  When negative or None, read
            up to the end of the range containing the current position.
        :return: The requested slice of the buffered data.
        :raises errors.InvalidRange: If the current position is not inside
            a known range, or the request runs past the end of that range.
        """
        # find the last range which has a start <= pos
        idx = bisect(self._ranges, self._pos) - 1

        if idx < 0 or self._pos > self._ranges[idx]._ent_end:
            mutter('Bisect for pos: %s failed. Found offset: %d, ranges:%s',
                   self._pos, idx, self._ranges)
            raise errors.InvalidRange(self._path, self._pos)

        current = self._ranges[idx]

        if size is None or size < 0:
            # A negative size used to produce a bogus slice and move the
            # position backwards; give it the usual "read what is left"
            # meaning, limited to this range.
            size = current._ent_end - self._pos + 1
        elif (self._pos + size - 1) > current._ent_end:
            raise errors.InvalidRange(self._path, self._pos)

        start = current._data_start + (self._pos - current._ent_start)
        end = start + size
        self._pos += size
        return self._data[start:end]

    def seek(self, offset, whence=0):
        """Move the current position.

        :param offset: Offset to apply.
        :param whence: 0 to seek from the start, 1 from the current
            position, 2 from the highest end offset of the known ranges.
        :raises ValueError: For any other value of whence.

        A position that would become negative is set to 0.
        """
        if whence == 0:
            self._pos = offset
        elif whence == 1:
            self._pos += offset
        elif whence == 2:
            self._pos = self._len + offset
        else:
            raise ValueError("Invalid value %s for whence." % whence)

        if self._pos < 0:
            self._pos = 0

    def tell(self):
        """Return the current position."""
        return self._pos
133
1786.1.5 by John Arbash Meinel
Move the common Multipart stuff into plain http, and wrap pycurl response so that it matches the urllib response object.
134
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
135
class HttpRangeResponse(RangeFile):
    """A single-range HTTP response.

    The whole body is one range, whose entity offsets are taken from the
    Content-Range header.
    """

    # TODO: jam 20060706 Consider compiling these regexes on demand
    # Groups: range unit, first byte, last byte, total length.  The total
    # may be '*', which a server sends when the length is not known.
    _CONTENT_RANGE_RE = re.compile(
        r'\s*([^\s]+)\s+([0-9]+)-([0-9]+)/([0-9]+|\*)\s*$')

    def __init__(self, path, content_range, input_file):
        """Wrap a non-multipart range response.

        :param path: Name used in error messages.
        :param content_range: The value of the Content-Range header.
        :param input_file: Object whose read() returns the response body.
        """
        # mutter("parsing 206 non-multipart response for %s", path)
        RangeFile.__init__(self, path, input_file)
        start, end = self._parse_range(content_range, path)
        # The body holds nothing but the range, so its data starts at 0.
        self._add_range(start, end, 0)
        self._finish_ranges()

    @staticmethod
    def _parse_range(range, path='<unknown>'):
        """Parse an http Content-range header and return start + end

        :param range: The value for Content-range
        :param path: Provide to give better error messages.
        :return: (start, end) A tuple of integers
        :raises errors.InvalidHttpRange: If the value does not look like
            '<unit> <start>-<end>/<total>' or the unit is not 'bytes'.
        """
        match = HttpRangeResponse._CONTENT_RANGE_RE.match(range)
        if not match:
            raise errors.InvalidHttpRange(path, range,
                                          "Invalid Content-range")

        # The total length (group 4) is not needed.
        rtype, start, end = match.group(1, 2, 3)

        if rtype != 'bytes':
            raise errors.InvalidHttpRange(path, range,
                    "Unsupported range type '%s'" % (rtype,))

        # The regex only lets ASCII digits through for both offsets, so
        # these conversions cannot fail.
        return int(start), int(end)
1786.1.16 by John Arbash Meinel
Refactor tests
175
176
177
class HttpMultipartRangeResponse(RangeFile):
    """A multi-range HTTP response.

    The body is scanned for the part headers introduced by the boundary
    given in the Content-Type header; every part found becomes one range.
    """

    # Group 2 is the boundary, with or without surrounding double quotes.
    _CONTENT_TYPE_RE = re.compile(
        r'^\s*multipart/byteranges\s*;\s*boundary\s*=\s*("?)([^"]*?)\1\s*$')

    # Start with --<boundary>\r\n
    # and ignore all headers ending in \r\n
    # except for content-range:
    # and find the two trailing \r\n separators
    # indicating the start of the text
    # TODO: jam 20060706 This requires exact conformance
    #       to the spec, we probably could relax the requirement
    #       of \r\n, and use something more like (\r?\n)
    # This is deliberately not a raw string: the pattern contains real
    # CR and LF characters.  %s is replaced by the escaped boundary.
    _BOUNDARY_PATT = (
        "^--%s(?:\r\n(?:(?:content-range:([^\r]+))|[^\r]+))+\r\n\r\n")

    def __init__(self, path, content_type, input_file):
        """Wrap a multipart range response.

        :param path: Name used in error messages.
        :param content_type: The value of the Content-Type header.
        :param input_file: Object whose read() returns the response body.
        :raises errors.InvalidHttpContentType: If content_type is not a
            multipart/byteranges type with a boundary.
        :raises errors.InvalidHttpResponse: If a part has no Content-Range
            header.
        :raises errors.InvalidHttpRange: If a part has a Content-Range
            header that cannot be parsed.
        """
        # mutter("parsing 206 multipart response for %s", path)
        RangeFile.__init__(self, path, input_file)

        self.boundary_regex = self._parse_boundary(content_type, path)
        # mutter('response:\n%r', self._data)

        for match in self.boundary_regex.finditer(self._data):
            content_range = match.group(1)
            if content_range is None:
                # The part had headers, but none of them was
                # Content-Range.  Report it the same way a missing header
                # is reported for a single range response, instead of
                # failing with a TypeError in the range parser.
                raise errors.InvalidHttpResponse(path,
                    'Missing the Content-Range header in a part of a'
                    ' multipart range response')
            ent_start, ent_end = HttpRangeResponse._parse_range(
                content_range, path)
            # The data of the part starts right after its headers.
            self._add_range(ent_start, ent_end, match.end())

        self._finish_ranges()

    @staticmethod
    def _parse_boundary(ctype, path='<unknown>'):
        """Parse the Content-type field.

        This expects a multipart Content-type, and returns a
        regex which is capable of finding the boundaries
        in the multipart data.

        :param ctype: The value of the Content-Type header.
        :param path: Provide to give better error messages.
        :return: A compiled, case-insensitive, multi-line regex whose
            group 1 is the Content-Range value of the part it matched.
        :raises errors.InvalidHttpContentType: If ctype does not match
            _CONTENT_TYPE_RE.
        """
        match = HttpMultipartRangeResponse._CONTENT_TYPE_RE.match(ctype)
        if not match:
            raise errors.InvalidHttpContentType(path, ctype,
                    "Expected multipart/byteranges with boundary")

        boundary = match.group(2)
        # mutter('multipart boundary is %s', boundary)
        pattern = HttpMultipartRangeResponse._BOUNDARY_PATT
        # The boundary is arbitrary text, so it is escaped before being
        # inserted in the pattern.
        return re.compile(pattern % re.escape(boundary),
                          re.IGNORECASE | re.MULTILINE)
1786.1.21 by John Arbash Meinel
(broken) Work on factoring out handle_response so we can test with fake headers.
228
229
1786.1.25 by John Arbash Meinel
Test that we can extract headers properly.
230
def _is_multipart(content_type):
    """Tell whether a Content-Type value announces a multipart range body.

    The value must be 'multipart/byteranges' followed by ';' and the
    parameters.  Whitespace before the type and before the ';' is allowed,
    since the regex which later extracts the boundary accepts it too; a
    value refused here would be handled as a single range response.

    :param content_type: The value of the Content-Type header.
    :return: True if the media type is multipart/byteranges and
        parameters follow it, False otherwise.
    """
    pieces = content_type.split(';', 1)
    if len(pieces) != 2:
        # No parameters at all, so there cannot be a boundary.
        return False
    return pieces[0].strip() == 'multipart/byteranges'
232
233
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
234
def handle_response(url, code, headers, data):
    """Interpret the code & headers and return a HTTP response.

    This is a factory method which returns an appropriate HTTP response
    based on the code & headers it's given.

    :param url: The url being processed. Mostly for error reporting
    :param code: The integer HTTP response code
    :param headers: A dict-like object that contains the HTTP response headers
    :param data: A file-like object that can be read() to get the
                 requested data
    :return: A file-like object that can seek()+read() the
             ranges indicated by the headers.
    :raises errors.NoSuchFile: For a 404 response.
    :raises errors.InvalidRange: For a 416 response.
    :raises errors.InvalidHttpResponse: For a 206 response which is not
        multipart and has no Content-Range header, and for any response
        code not handled here.
    """
    if code == 206:
        try:
            content_type = headers['Content-Type']
        except KeyError:
            # When there is no content-type header we treat
            # the response as being of type 'application/octet-stream' as per
            # RFC2616 section 7.2.1.
            # Therefore it is obviously not multipart
            content_type = 'application/octet-stream'
            is_multipart = False
        else:
            is_multipart = _is_multipart(content_type)

        if is_multipart:
            # Full fledged multipart response
            return HttpMultipartRangeResponse(url, content_type, data)

        # A response to a range request, but not multipart
        try:
            content_range = headers['Content-Range']
        except KeyError:
            raise errors.InvalidHttpResponse(url,
                'Missing the Content-Range header in a 206 range response')
        return HttpRangeResponse(url, content_range, data)
    elif code == 200:
        # A regular non-range response, unfortunately the result from
        # urllib doesn't support seek, so we wrap it in a StringIO.
        # Both seek and tell are required before the object is handed back
        # untouched: checking tell alone let through objects which could
        # not seek.
        can_seek = getattr(data, 'seek', None) is not None
        can_tell = getattr(data, 'tell', None) is not None
        if not (can_seek and can_tell):
            return StringIO(data.read())
        return data
    elif code == 404:
        raise errors.NoSuchFile(url)
    elif code == 416:
        # We don't know which, but one of the ranges we specified
        # was wrong. So we raise with 0 for a lack of a better
        # magic value.
        raise errors.InvalidRange(url, 0)
    else:
        # TODO: jam 20060713 Properly handle redirects (302 Found, etc)
        #       The '_get' code says to follow redirects, we probably
        #       should actually handle the return values
        raise errors.InvalidHttpResponse(url, "Unknown response code %s"
                                              % (code,))
1786.1.21 by John Arbash Meinel
(broken) Work on factoring out handle_response so we can test with fake headers.
294