~bzr-pqm/bzr/bzr.dev

1540.3.1 by Martin Pool
First-cut implementation of pycurl. Substantially faster than using urllib.
1
# Copyright (C) 2006 Canonical Ltd
1540.3.18 by Martin Pool
Style review fixes (thanks robertc)
2
#
1540.3.1 by Martin Pool
First-cut implementation of pycurl. Substantially faster than using urllib.
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
1540.3.18 by Martin Pool
Style review fixes (thanks robertc)
7
#
1540.3.1 by Martin Pool
First-cut implementation of pycurl. Substantially faster than using urllib.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
1540.3.18 by Martin Pool
Style review fixes (thanks robertc)
12
#
1540.3.1 by Martin Pool
First-cut implementation of pycurl. Substantially faster than using urllib.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""http/https transport using pycurl"""
18
19
# TODO: test reporting of http errors
1887.1.1 by Adeodato Simó
Do not separate paragraphs in the copyright statement with blank lines,
20
#
1616.1.9 by Martin Pool
Set Cache-control: max-age=0 and Pragma: no-cache
21
# TODO: Transport option to control caching of particular requests; broadly we
22
# would want to offer "caching allowed" or "must revalidate", depending on
23
# whether we expect a particular file will be modified after it's committed.
24
# It's probably safer to just always revalidate.  mbp 20060321
25
1612.1.1 by Martin Pool
Raise errors correctly on pycurl connection failure
26
import os
1786.1.42 by John Arbash Meinel
Update _extract_headers, make it less generic, and non recursive.
27
from cStringIO import StringIO
1540.3.5 by Martin Pool
Raise exception if unicode is passed to transport; formatting fixes
28
2004.1.25 by v.ladeuil+lp at free
Shuffle http related test code. Hopefully it ends up at the right place :)
29
from bzrlib import (
30
    errors,
31
    __version__ as bzrlib_version,
32
    )
1540.3.15 by Martin Pool
[merge] large merge to sync with bzr.dev
33
import bzrlib
2004.1.25 by v.ladeuil+lp at free
Shuffle http related test code. Hopefully it ends up at the right place :)
34
from bzrlib.errors import (NoSuchFile,
35
                           ConnectionError,
1540.3.7 by Martin Pool
Prepare to select a transport depending on what dependencies can be satisfied.
36
                           DependencyNotPresent)
1540.3.18 by Martin Pool
Style review fixes (thanks robertc)
37
from bzrlib.trace import mutter
1636.1.2 by Robert Collins
More review fixen to the relpath at '/' fixes.
38
from bzrlib.transport import register_urlparse_netloc_protocol
2004.1.25 by v.ladeuil+lp at free
Shuffle http related test code. Hopefully it ends up at the right place :)
39
from bzrlib.transport.http import (
40
    _extract_headers,
41
    HttpTransportBase,
42
    _pycurl_errors,
43
    response,
44
    )
1540.3.1 by Martin Pool
First-cut implementation of pycurl. Substantially faster than using urllib.
45
1540.3.7 by Martin Pool
Prepare to select a transport depending on what dependencies can be satisfied.
46
try:
47
    import pycurl
48
except ImportError, e:
49
    mutter("failed to import pycurl: %s", e)
50
    raise DependencyNotPresent('pycurl', e)
51
1684.1.5 by Martin Pool
(patch) check that pycurl will actuall initialize as well as load (Alexander)
52
try:
53
    # see if we can actually initialize PyCurl - sometimes it will load but
54
    # fail to start up due to this bug:
55
    #  
56
    #   32. (At least on Windows) If libcurl is built with c-ares and there's
57
    #   no DNS server configured in the system, the ares_init() call fails and
58
    #   thus curl_easy_init() fails as well. This causes weird effects for
59
    #   people who use numerical IP addresses only.
60
    #
61
    # reported by Alexander Belchenko, 2006-04-26
62
    pycurl.Curl()
63
except pycurl.error, e:
64
    mutter("failed to initialize pycurl: %s", e)
65
    raise DependencyNotPresent('pycurl', e)
66
1540.3.7 by Martin Pool
Prepare to select a transport depending on what dependencies can be satisfied.
67
1636.1.2 by Robert Collins
More review fixen to the relpath at '/' fixes.
68
register_urlparse_netloc_protocol('http+pycurl')
1636.1.1 by Robert Collins
Fix calling relpath() and abspath() on transports at their root.
69
70
1540.3.1 by Martin Pool
First-cut implementation of pycurl. Substantially faster than using urllib.
71
class PyCurlTransport(HttpTransportBase):
1540.3.3 by Martin Pool
Review updates of pycurl transport
72
    """http client transport using pycurl
73
74
    PyCurl is a Python binding to the C "curl" multiprotocol client.
75
2004.1.30 by v.ladeuil+lp at free
Fix #62276 and #62029 by providing a more robust http range handling.
76
    This transport can be significantly faster than the builtin
77
    Python client.  Advantages include: DNS caching.
1540.3.3 by Martin Pool
Review updates of pycurl transport
78
    """
79
1786.1.32 by John Arbash Meinel
cleanup pass, allow pycurl connections to be shared between transports.
80
    def __init__(self, base, from_transport=None):
1540.3.1 by Martin Pool
First-cut implementation of pycurl. Substantially faster than using urllib.
81
        super(PyCurlTransport, self).__init__(base)
1786.1.32 by John Arbash Meinel
cleanup pass, allow pycurl connections to be shared between transports.
82
        if from_transport is not None:
2000.3.1 by v.ladeuil+lp at free
Better connection sharing by using only one curl object.
83
            self._curl = from_transport._curl
1786.1.32 by John Arbash Meinel
cleanup pass, allow pycurl connections to be shared between transports.
84
        else:
85
            mutter('using pycurl %s' % pycurl.version)
2000.3.1 by v.ladeuil+lp at free
Better connection sharing by using only one curl object.
86
            self._curl = pycurl.Curl()
1540.3.1 by Martin Pool
First-cut implementation of pycurl. Substantially faster than using urllib.
87
1540.3.10 by Martin Pool
[broken] keep hooking pycurl into test framework
88
    def should_cache(self):
89
        """Return True if the data pulled across should be cached locally.
90
        """
91
        return True
92
1540.3.3 by Martin Pool
Review updates of pycurl transport
93
    def has(self, relpath):
1786.1.32 by John Arbash Meinel
cleanup pass, allow pycurl connections to be shared between transports.
94
        """See Transport.has()"""
95
        # We set NO BODY=0 in _get_full, so it should be safe
96
        # to re-use the non-range curl object
2000.3.1 by v.ladeuil+lp at free
Better connection sharing by using only one curl object.
97
        curl = self._curl
1540.3.24 by Martin Pool
Add new protocol 'http+pycurl' that always uses PyCurl.
98
        abspath = self._real_abspath(relpath)
1540.3.14 by Martin Pool
[pycurl] Make Curl instance a local variable not a long-lived object.
99
        curl.setopt(pycurl.URL, abspath)
100
        self._set_curl_options(curl)
2018.2.28 by Andrew Bennetts
Changes in response to review: re-use _base_curl, rather than keeping a seperate _post_curl object; add docstring to test_http.RecordingServer, set is_user_error on some new exceptions.
101
        curl.setopt(pycurl.HTTPGET, 1)
1540.3.3 by Martin Pool
Review updates of pycurl transport
102
        # don't want the body - ie just do a HEAD request
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
103
        # This means "NO BODY" not 'nobody'
1540.3.14 by Martin Pool
[pycurl] Make Curl instance a local variable not a long-lived object.
104
        curl.setopt(pycurl.NOBODY, 1)
2004.1.16 by v.ladeuil+lp at free
Add tests against erroneous http status lines.
105
        # In some erroneous cases, pycurl will emit text on
106
        # stdout if we don't catch it (see InvalidStatus tests
107
        # for one such occurrence).
108
        blackhole = StringIO()
109
        curl.setopt(pycurl.WRITEFUNCTION, blackhole.write)
1540.3.14 by Martin Pool
[pycurl] Make Curl instance a local variable not a long-lived object.
110
        self._curl_perform(curl)
111
        code = curl.getinfo(pycurl.HTTP_CODE)
112
        if code == 404: # not found
113
            return False
114
        elif code in (200, 302): # "ok", "found"
115
            return True
116
        else:
1612.1.1 by Martin Pool
Raise errors correctly on pycurl connection failure
117
            self._raise_curl_http_error(curl)
2000.3.1 by v.ladeuil+lp at free
Better connection sharing by using only one curl object.
118
1786.1.8 by John Arbash Meinel
[merge] Johan Rydberg test updates
119
    def _get(self, relpath, ranges, tail_amount=0):
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
120
        # This just switches based on the type of request
121
        if ranges is not None or tail_amount not in (0, None):
122
            return self._get_ranged(relpath, ranges, tail_amount=tail_amount)
123
        else:
124
            return self._get_full(relpath)
2000.3.1 by v.ladeuil+lp at free
Better connection sharing by using only one curl object.
125
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
126
    def _setup_get_request(self, curl, relpath):
2018.2.6 by Andrew Bennetts
HTTP client starting to work (pycurl for the moment).
127
        # Make sure we do a GET request. versions > 7.14.1 also set the
128
        # NO BODY flag, but we'll do it ourselves in case it is an older
129
        # pycurl version
130
        curl.setopt(pycurl.NOBODY, 0)
131
        curl.setopt(pycurl.HTTPGET, 1)
132
        return self._setup_request(curl, relpath)
133
134
    def _setup_request(self, curl, relpath):
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
135
        """Do the common setup stuff for making a request
136
137
        :param curl: The curl object to place the request on
138
        :param relpath: The relative path that we want to get
139
        :return: (abspath, data, header) 
140
                 abspath: full url
141
                 data: file that will be filled with the body
142
                 header: file that will be filled with the headers
143
        """
144
        abspath = self._real_abspath(relpath)
145
        curl.setopt(pycurl.URL, abspath)
146
        self._set_curl_options(curl)
147
148
        data = StringIO()
149
        header = StringIO()
150
        curl.setopt(pycurl.WRITEFUNCTION, data.write)
151
        curl.setopt(pycurl.HEADERFUNCTION, header.write)
152
153
        return abspath, data, header
154
155
    def _get_full(self, relpath):
156
        """Make a request for the entire file"""
2000.3.1 by v.ladeuil+lp at free
Better connection sharing by using only one curl object.
157
        curl = self._curl
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
158
        abspath, data, header = self._setup_get_request(curl, relpath)
159
        self._curl_perform(curl)
160
161
        code = curl.getinfo(pycurl.HTTP_CODE)
162
        data.seek(0)
163
164
        if code == 404:
165
            raise NoSuchFile(abspath)
166
        if code != 200:
2004.1.25 by v.ladeuil+lp at free
Shuffle http related test code. Hopefully it ends up at the right place :)
167
            self._raise_curl_http_error(
168
                curl, 'expected 200 or 404 for full response.')
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
169
170
        return code, data
171
172
    def _get_ranged(self, relpath, ranges, tail_amount):
173
        """Make a request for just part of the file."""
2000.3.1 by v.ladeuil+lp at free
Better connection sharing by using only one curl object.
174
        curl = self._curl
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
175
        abspath, data, header = self._setup_get_request(curl, relpath)
176
2004.1.30 by v.ladeuil+lp at free
Fix #62276 and #62029 by providing a more robust http range handling.
177
        range_header = self.attempted_range_header(ranges, tail_amount)
178
        if range_header is None:
179
            # Forget ranges, the server can't handle them
180
            return self._get_full(relpath)
181
2000.3.8 by v.ladeuil+lp at free
The heart of the modification.
182
        self._curl_perform(curl, ['Range: bytes=%s'
183
                                  % self.range_header(ranges, tail_amount)])
1786.1.33 by John Arbash Meinel
Cleanup pass #2
184
        data.seek(0)
185
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
186
        code = curl.getinfo(pycurl.HTTP_CODE)
1979.1.1 by John Arbash Meinel
Fix bug #57723, parse boundary="" correctly, since Squid uses it
187
        # mutter('header:\n%r', header.getvalue())
1786.1.42 by John Arbash Meinel
Update _extract_headers, make it less generic, and non recursive.
188
        headers = _extract_headers(header.getvalue(), abspath)
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
189
        # handle_response will raise NoSuchFile, etc based on the response code
190
        return code, response.handle_response(abspath, code, headers, data)
1786.1.4 by John Arbash Meinel
Adding HEADERFUNCTION which lets us get any response codes we want.
191
2018.2.6 by Andrew Bennetts
HTTP client starting to work (pycurl for the moment).
192
    def _post(self, body_bytes):
193
        fake_file = StringIO(body_bytes)
2000.3.4 by v.ladeuil+lp at free
Merge bzr.dev
194
        curl = self._curl
2018.2.28 by Andrew Bennetts
Changes in response to review: re-use _base_curl, rather than keeping a seperate _post_curl object; add docstring to test_http.RecordingServer, set is_user_error on some new exceptions.
195
        # Other places that use _base_curl for GET requests explicitly set
196
        # HTTPGET, so it should be safe to re-use the same object for both GETs
197
        # and POSTs.
2018.2.6 by Andrew Bennetts
HTTP client starting to work (pycurl for the moment).
198
        curl.setopt(pycurl.POST, 1)
199
        curl.setopt(pycurl.POSTFIELDSIZE, len(body_bytes))
200
        curl.setopt(pycurl.READFUNCTION, fake_file.read)
201
        abspath, data, header = self._setup_request(curl, '.bzr/smart')
2000.3.4 by v.ladeuil+lp at free
Merge bzr.dev
202
        # We override the Expect: header so that pycurl will send the POST
203
        # body immediately.
204
        self._curl_perform(curl,['Expect: '])
2018.2.6 by Andrew Bennetts
HTTP client starting to work (pycurl for the moment).
205
        data.seek(0)
206
        code = curl.getinfo(pycurl.HTTP_CODE)
207
        headers = _extract_headers(header.getvalue(), abspath)
208
        return code, response.handle_response(abspath, code, headers, data)
209
1786.1.40 by John Arbash Meinel
code cleanups from Martin Pool.
210
    def _raise_curl_http_error(self, curl, info=None):
1612.1.1 by Martin Pool
Raise errors correctly on pycurl connection failure
211
        code = curl.getinfo(pycurl.HTTP_CODE)
212
        url = curl.getinfo(pycurl.EFFECTIVE_URL)
2004.1.27 by v.ladeuil+lp at free
Fix bug #57644 by issuing an explicit error message.
213
        # Some error codes can be handled the same way for all
214
        # requests
215
        if code == 403:
2004.1.34 by v.ladeuil+lp at free
Cosmetic fix for bug #57644.
216
            raise errors.TransportError(
217
                'Server refuses to fullfil the request for: %s' % url)
1786.1.40 by John Arbash Meinel
code cleanups from Martin Pool.
218
        else:
2004.1.27 by v.ladeuil+lp at free
Fix bug #57644 by issuing an explicit error message.
219
            if info is None:
220
                msg = ''
221
            else:
222
                msg = ': ' + info
223
            raise errors.InvalidHttpResponse(
224
                url, 'Unable to handle http code %d%s' % (code,msg))
1540.3.1 by Martin Pool
First-cut implementation of pycurl. Substantially faster than using urllib.
225
1540.3.13 by Martin Pool
Curl should follow http redirects, the same as urllib
226
    def _set_curl_options(self, curl):
227
        """Set options for all requests"""
1540.3.14 by Martin Pool
[pycurl] Make Curl instance a local variable not a long-lived object.
228
        ## curl.setopt(pycurl.VERBOSE, 1)
1616.1.9 by Martin Pool
Set Cache-control: max-age=0 and Pragma: no-cache
229
        # TODO: maybe include a summary of the pycurl version
1786.1.33 by John Arbash Meinel
Cleanup pass #2
230
        ua_str = 'bzr/%s (pycurl)' % (bzrlib.__version__,)
1540.3.15 by Martin Pool
[merge] large merge to sync with bzr.dev
231
        curl.setopt(pycurl.USERAGENT, ua_str)
1540.3.13 by Martin Pool
Curl should follow http redirects, the same as urllib
232
        curl.setopt(pycurl.FOLLOWLOCATION, 1) # follow redirect responses
1540.3.3 by Martin Pool
Review updates of pycurl transport
233
2000.3.1 by v.ladeuil+lp at free
Better connection sharing by using only one curl object.
234
    def _curl_perform(self, curl, more_headers=[]):
1540.3.3 by Martin Pool
Review updates of pycurl transport
235
        """Perform curl operation and translate exceptions."""
236
        try:
2000.3.1 by v.ladeuil+lp at free
Better connection sharing by using only one curl object.
237
            # There's no way in http/1.0 to say "must
238
            # revalidate"; we don't want to force it to always
239
            # retrieve.  so just turn off the default Pragma
240
            # provided by Curl.
241
            headers = ['Cache-control: max-age=0',
242
                       'Pragma: no-cache',
243
                       'Connection: Keep-Alive']
244
            curl.setopt(pycurl.HTTPHEADER, headers + more_headers)
1540.3.14 by Martin Pool
[pycurl] Make Curl instance a local variable not a long-lived object.
245
            curl.perform()
1540.3.3 by Martin Pool
Review updates of pycurl transport
246
        except pycurl.error, e:
1786.1.35 by John Arbash Meinel
For pycurl inverse of (NOBODY,1) is (HTTPGET,1) not (NOBODY,0)
247
            url = curl.getinfo(pycurl.EFFECTIVE_URL)
248
            mutter('got pycurl error: %s, %s, %s, url: %s ',
249
                    e[0], _pycurl_errors.errorcode[e[0]], e, url)
250
            if e[0] in (_pycurl_errors.CURLE_COULDNT_RESOLVE_HOST,
2051.2.1 by Matthieu Moy
correct handling of proxy error
251
                        _pycurl_errors.CURLE_COULDNT_CONNECT,
2004.1.40 by v.ladeuil+lp at free
Fix the race condition again and correct some small typos to be in
252
                        _pycurl_errors.CURLE_GOT_NOTHING,
253
                        _pycurl_errors.CURLE_COULDNT_RESOLVE_PROXY):
2051.2.1 by Matthieu Moy
correct handling of proxy error
254
                raise ConnectionError('curl connection error (%s)\non %s'
255
                              % (e[1], url))
2000.3.9 by v.ladeuil+lp at free
The tests that would have help avoid bug #73948 and all that mess :)
256
            elif e[0] == _pycurl_errors.CURLE_PARTIAL_FILE:
2180.1.2 by Aaron Bentley
Grammar fixes
257
                # Pycurl itself has detected a short read.  We do
258
                # not have all the information for the
2000.3.9 by v.ladeuil+lp at free
The tests that would have help avoid bug #73948 and all that mess :)
259
                # ShortReadvError, but that should be enough
260
                raise errors.ShortReadvError(url,
261
                                             offset='unknown', length='unknown',
262
                                             actual='unknown',
263
                                             extra='Server aborted the request')
2180.1.2 by Aaron Bentley
Grammar fixes
264
            # jam 20060713 The code didn't use to re-raise the exception here,
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
265
            # but that seemed bogus
266
            raise
1540.3.1 by Martin Pool
First-cut implementation of pycurl. Substantially faster than using urllib.
267
1540.3.10 by Martin Pool
[broken] keep hooking pycurl into test framework
268
269
def get_test_permutations():
    """Return the permutations to be used in testing.

    :return: A list holding the single pair
        (PyCurlTransport, HttpServer_PyCurl).
    """
    # The test server is imported here, not at module level, so it is only
    # loaded when this function is called.
    from bzrlib.tests.HttpServer import HttpServer_PyCurl
    permutations = [(PyCurlTransport, HttpServer_PyCurl)]
    return permutations