# Copyright (C) 2006 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Implementation of urllib2 tailored to bzr needs

This file re-implements the urllib2 class hierarchy with custom classes.

For instance, we create a new HTTPConnection and HTTPSConnection that inherit
from the original urllib2.HTTP(s)Connection objects, but also have a new base
which implements a custom getresponse and fake_close handlers.

And then we implement custom HTTPHandler and HTTPSHandler classes, that use
the custom HTTPConnection classes.

We have a custom Response class, which lets us maintain a keep-alive
connection even for requests that urllib2 doesn't expect to contain body data.

And a custom Request class that lets us track redirections, and send
authentication data without requiring an extra round trip to get rejected by
the server. We also create a Request hierarchy, to make it clear what type
of request is being made.
"""
# TODO: It may be possible to share the password_manager across
# all transports by prefixing the realm by the protocol used
# (especially if other protocols do not use realms). See
# PasswordManager below.

# FIXME: Oversimplifying, two kinds of exceptions should be
# raised, once a request is issued: URLError before we have been
# able to process the response, HTTPError after that. Process the
# response means we are able to leave the socket clean, so if we
# are not able to do that, we should close the connection. The
# actual code more or less does that, tests should be written to
# ensure that.
import httplib
import re
import socket
import sys
import urllib
import urllib2
import urlparse

from bzrlib import __version__ as bzrlib_version
from bzrlib import errors

# Verbosity level for this module's debug output (0 = silent).
DEBUG = 0
64
# We define our own Response class to keep our httplib pipe clean
65
class Response(httplib.HTTPResponse):
66
"""Custom HTTPResponse, to avoid the need to decorate.
68
httplib prefers to decorate the returned objects, rather
69
than using a custom object.
72
# Some responses have bodies in which we have no interest
73
_body_ignored_responses = [301,302, 303, 307, 401, 403, 404]
75
def __init__(self, *args, **kwargs):
76
httplib.HTTPResponse.__init__(self, *args, **kwargs)
79
"""Begin to read the response from the server.
81
httplib assumes that some responses get no content and do
82
not even attempt to read the body in that case, leaving
83
the body in the socket, blocking the next request. Let's
84
try to workaround that.
86
httplib.HTTPResponse.begin(self)
87
if self.status in self._body_ignored_responses:
88
if self.debuglevel > 0:
89
print "For status: [%s]," % self.status,
90
print "will ready body, length: ",
91
if self.length is not None:
92
print "[%d]" % self.length
95
if not (self.length is None or self.will_close):
96
# In some cases, we just can't read the body not
97
# even try or we may encounter a 104, 'Connection
98
# reset by peer' error if there is indeed no body
99
# and the server closed the connection just after
100
# having issued the response headers (even if the
101
# headers indicate a Content-Type...)
102
body = self.fp.read(self.length)
103
if self.debuglevel > 0:
104
print "Consumed body: [%s]" % body
108
# Not inheriting from 'object' because httplib.HTTPConnection doesn't.
class AbstractHTTPConnection:
    """A custom HTTP(S) Connection, which can reset itself on a bad response"""

    response_class = Response
    strict = 1 # We don't support HTTP/0.9

    def fake_close(self):
        """Make the connection believe the response has been fully handled.

        That makes the httplib.HTTPConnection happy
        """
        # Preserve our preciousss: close() would discard the socket,
        # breaking the keep-alive connection, so we hide it from
        # close() and restore it afterwards.
        sock = self.sock
        self.sock = None
        self.close()
        self.sock = sock
class HTTPConnection(AbstractHTTPConnection, httplib.HTTPConnection):
    """HTTP connection combining our custom base with httplib's."""
    pass
class HTTPSConnection(AbstractHTTPConnection, httplib.HTTPSConnection):
    """HTTPS connection combining our custom base with httplib's."""
    pass
class Request(urllib2.Request):
    """A custom Request object.

    urllib2 determines the request method heuristically (based on
    the presence or absence of data). We set the method
    statically from the 'method' constructor parameter instead.

    Also, the Request object tracks the connection the request will
    be made on and, via 'parent'/'redirected_to', the chain of
    requests produced by redirections.
    """

    # NOTE: 'headers={}' is kept for interface compatibility; it is
    # never mutated here (urllib2.Request copies the entries).
    def __init__(self, method, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 connection=None, parent=None,):
        # urllib2.Request will be confused if we don't extract
        # authentification info before building the request
        url, self.user, self.password = self.extract_auth(url)
        urllib2.Request.__init__(self, url, data, headers,
                                 origin_req_host, unverifiable)
        self.method = method
        self.connection = connection
        # To handle redirections
        self.parent = parent
        self.redirected_to = None

    def extract_auth(self, url):
        """Extracts authentification information from url.

        Get user and password from url of the form: http://user:pass@host/path

        :return: (url stripped of auth info, user or None, password or None)
        """
        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)

        if '@' in netloc:
            auth, netloc = netloc.split('@', 1)
            if ':' in auth:
                user, password = auth.split(':', 1)
            else:
                user, password = auth, None
            user = urllib.unquote(user)
            if password is not None:
                password = urllib.unquote(password)
        else:
            user = None
            password = None

        url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))

        return url, user, password

    def get_method(self):
        """See urllib2.Request.get_method: we return the static method."""
        return self.method
# The urlib2.xxxAuthHandler handle the authentification of the
# requests, to do that, they need an urllib2 PasswordManager *at
# build time*. We also need one to reuse the passwords already
# typed by the user.
class PasswordManager(urllib2.HTTPPasswordMgrWithDefaultRealm):
    """Password manager shared between handlers (see TODO at module top)."""

    def __init__(self):
        urllib2.HTTPPasswordMgrWithDefaultRealm.__init__(self)
class ConnectionHandler(urllib2.BaseHandler):
199
"""Provides connection-sharing by pre-processing requests.
201
urllib2 provides no way to access the HTTPConnection object
202
internally used. But we need it in order to achieve
203
connection sharing. So, we add it to the request just before
204
it is processed, and then we override the do_open method for
208
handler_order = 1000 # after all pre-processings
210
def get_key(self, connection):
211
"""Returns the key for the connection in the cache"""
212
return '%s:%d' % (connection.host, connection.port)
214
def create_connection(self, request, http_connection_class):
215
host = request.get_host()
217
# Just a bit of paranoia here, this should have been
218
# handled in the higher levels
219
raise errors.InvalidURL(request.get_full_url(), 'no host given.')
221
# We create a connection (but it will not connect yet)
223
connection = http_connection_class(host)
224
except httplib.InvalidURL, exception:
225
# There is only one occurrence of InvalidURL in httplib
226
raise errors.InvalidURL(request.get_full_url(),
227
extra='nonnumeric port')
231
def capture_connection(self, request, http_connection_class):
232
"""Capture or inject the request connection.
235
- the request have no connection: create a new one,
237
- the request have a connection: this one have been used
238
already, let's capture it, so that we can give it to
239
another transport to be reused. We don't do that
240
ourselves: the Transport object get the connection from
241
a first request and then propagate it, from request to
242
request or to cloned transports.
244
connection = request.connection
245
if connection is None:
247
connection = self.create_connection(request, http_connection_class)
248
request.connection = connection
250
# All connections will pass here, propagate debug level
251
connection.set_debuglevel(DEBUG)
254
def http_request(self, request):
255
return self.capture_connection(request, HTTPConnection)
257
def https_request(self, request):
258
return self.capture_connection(request, HTTPSConnection)
261
class AbstractHTTPHandler(urllib2.AbstractHTTPHandler):
262
"""A custom handler for HTTP(S) requests.
264
We overrive urllib2.AbstractHTTPHandler to get a better
265
control of the connection, the ability to implement new
266
request types and return a response able to cope with
267
persistent connections.
270
# We change our order to be before urllib2 HTTP[S]Handlers
271
# and be chosen instead of them (the first http_open called
275
_default_headers = {'Pragma': 'no-cache',
276
'Cache-control': 'max-age=0',
277
'Connection': 'Keep-Alive',
278
# FIXME: Spell it User-*A*gent once we
279
# know how to properly avoid bogus
280
# urllib2 using capitalize() for headers
281
# instead of title(sp?).
282
'User-agent': 'bzr/%s (urllib)' % bzrlib_version,
283
# FIXME: pycurl also set the following, understand why
288
urllib2.AbstractHTTPHandler.__init__(self, debuglevel=DEBUG)
290
def http_request(self, request):
291
"""Common headers setting"""
293
request.headers.update(self._default_headers.copy())
294
# FIXME: We may have to add the Content-Length header if
295
# we have data to send.
298
def retry_or_raise(self, http_class, request, first_try):
299
"""Retry the request (once) or raise the exception.
301
urllib2 raises exception of application level kind, we
302
just have to translate them.
304
httplib can raise exceptions of transport level (badly
305
formatted dialog, loss of connexion or socket level
306
problems). In that case we should issue the request again
307
(httplib will close and reopen a new connection if
310
# When an exception occurs, we give back the original
311
# Traceback or the bugs are hard to diagnose.
312
exc_type, exc_val, exc_tb = sys.exc_info()
313
if exc_type == socket.gaierror:
314
# No need to retry, that will not help
315
raise errors.ConnectionError("Couldn't resolve host '%s'"
316
% request.get_origin_req_host(),
320
if self._debuglevel > 0:
321
print 'Received exception: [%r]' % exc_val
322
print ' On connection: [%r]' % request.connection
323
method = request.get_method()
324
url = request.get_full_url()
325
print ' Will retry, %s %r' % (method, url)
326
request.connection.close()
327
response = self.do_open(http_class, request, False)
328
convert_to_addinfourl = False
330
if self._debuglevel > 0:
331
print 'Received second exception: [%r]' % exc_val
332
print ' On connection: [%r]' % request.connection
333
if exc_type in (httplib.BadStatusLine, httplib.UnknownProtocol):
334
# httplib.BadStatusLine and
335
# httplib.UnknownProtocol indicates that a
336
# bogus server was encountered or a bad
337
# connection (i.e. transient errors) is
338
# experimented, we have already retried once
339
# for that request so we raise the exception.
340
my_exception = errors.InvalidHttpResponse(
341
request.get_full_url(),
342
'Bad status line received',
345
# All other exception are considered connection related.
347
# httplib.HTTPException should indicate a bug
348
# in the urllib implementation, somewhow the
349
# httplib pipeline is in an incorrect state,
350
# we retry in hope that this will correct the
351
# problem but that may need investigation
352
# (note that no such bug is known as of
355
# socket errors generally occurs for reasons
356
# far outside our scope, so closing the
357
# connection and retrying is the best we can
360
# FIXME: and then there is HTTPError raised by:
361
# - HTTPDefaultErrorHandler (we define our own)
362
# - HTTPRedirectHandler.redirect_request
363
# - AbstractDigestAuthHandler.http_error_auth_reqed
365
my_exception = errors.ConnectionError(
366
msg= 'while sending %s %s:' % (request.get_method(),
367
request.get_selector()),
370
if self._debuglevel > 0:
371
print 'On connection: [%r]' % request.connection
372
method = request.get_method()
373
url = request.get_full_url()
374
print ' Failed again, %s %r' % (method, url)
375
print ' Will raise: [%r]' % my_exception
376
raise my_exception, None, exc_tb
377
return response, convert_to_addinfourl
379
def do_open(self, http_class, request, first_try=True):
380
"""See urllib2.AbstractHTTPHandler.do_open for the general idea.
382
The request will be retried once if it fails.
384
connection = request.connection
385
assert connection is not None, \
386
'Cannot process a request without a connection'
388
# Get all the headers
390
headers.update(request.header_items())
391
headers.update(request.unredirected_hdrs)
394
connection._send_request(request.get_method(),
395
request.get_selector(),
396
# FIXME: implements 100-continue
397
#None, # We don't send the body yet
400
if self._debuglevel > 0:
401
print 'Request sent: [%r]' % request
402
response = connection.getresponse()
403
convert_to_addinfourl = True
404
except (socket.gaierror, httplib.BadStatusLine, httplib.UnknownProtocol,
405
socket.error, httplib.HTTPException):
406
response, convert_to_addinfourl = self.retry_or_raise(http_class,
410
# FIXME: HTTPConnection does not fully support 100-continue (the
411
# server responses are just ignored)
414
# mutter('Will send the body')
415
# # We can send the body now
416
# body = request.get_data()
418
# raise URLError("No data given")
419
# connection.send(body)
420
# response = connection.getresponse()
422
if self._debuglevel > 0:
423
print 'Receives response: %r' % response
424
print ' For: %r(%r)' % (request.get_method(),
425
request.get_full_url())
427
if convert_to_addinfourl:
428
# Shamelessly copied from urllib2
432
fp = socket._fileobject(r)
433
resp = urllib2.addinfourl(fp, r.msg, req.get_full_url())
436
if self._debuglevel > 0:
437
print 'Create addinfourl: %r' % resp
438
print ' For: %r(%r)' % (request.get_method(),
439
request.get_full_url())
444
# # we need titled headers in a dict but
445
# # response.getheaders returns a list of (lower(header).
446
# # Let's title that because most of bzr handle titled
447
# # headers, but maybe we should switch to lowercased
449
# # jam 20060908: I think we actually expect the headers to
450
# # be similar to mimetools.Message object, which uses
451
# # case insensitive keys. It lowers() all requests.
452
# # My concern is that the code may not do perfect title case.
453
# # For example, it may use Content-type rather than Content-Type
455
# # When we get rid of addinfourl, we must ensure that bzr
456
# # always use titled headers and that any header received
457
# # from server is also titled.
460
# for header, value in (response.getheaders()):
461
# headers[header.title()] = value
462
# # FIXME: Implements a secured .read method
463
# response.code = response.status
464
# response.headers = headers
468
class HTTPHandler(AbstractHTTPHandler):
    """A custom handler that just thunks into HTTPConnection"""

    def http_open(self, request):
        return self.do_open(HTTPConnection, request)
class HTTPSHandler(AbstractHTTPHandler):
    """A custom handler that just thunks into HTTPSConnection"""

    def https_open(self, request):
        return self.do_open(HTTPSConnection, request)
class HTTPRedirectHandler(urllib2.HTTPRedirectHandler):
483
"""Handles redirect requests.
485
We have to implement our own scheme because we use a specific
486
Request object and because we want to implement a specific
490
# RFC2616 says that only read requests should be redirected
491
# without interacting with the user. But bzr use some
492
# shortcuts to optimize against roundtrips which can leads to
493
# write requests being issued before read requests of
494
# containing dirs can be redirected. So we redirect write
495
# requests in the same way which seems to respect the spirit
496
# of the RFC if not its letter.
498
def redirect_request(self, req, fp, code, msg, headers, newurl):
499
"""See urllib2.HTTPRedirectHandler.redirect_request"""
500
# We would have preferred to update the request instead
501
# of creating a new one, but the urllib2.Request object
502
# has a too complicated creation process to provide a
503
# simple enough equivalent update process. Instead, when
504
# redirecting, we only update the original request with a
505
# reference to the following request in the redirect
508
# Some codes make no sense on out context and are treated
511
# 300: Multiple choices for different representations of
512
# the URI. Using that mechanisn with bzr will violate the
513
# protocol neutrality of Transport.
515
# 304: Not modified (SHOULD only occurs with conditional
516
# GETs which are not used by our implementation)
518
# 305: Use proxy. I can't imagine this one occurring in
519
# our context-- vila/20060909
521
# 306: Unused (if the RFC says so...)
523
# FIXME: If the code is 302 and the request is HEAD, we
524
# MAY avoid following the redirections if the intent is
525
# to check the existence, we have a hint that the file
526
# exist, now if we want to be sure, we must follow the
527
# redirection. Let's do that for now.
529
if code in (301, 302, 303, 307):
530
return Request(req.get_method(),newurl,
531
headers = req.headers,
532
origin_req_host = req.get_origin_req_host(),
534
# TODO: It will be nice to be able to
535
# detect virtual hosts sharing the same
536
# IP address, that will allow us to
537
# share the same connection...
542
raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
544
def http_error_30x(self, req, fp, code, msg, headers):
545
"""Requests the redirected to URI.
547
Copied from urllib2 to be able to fake_close the
548
associated connection, *before* issuing the redirected
549
request but *after* having eventually raised an error.
551
# Some servers (incorrectly) return multiple Location headers
552
# (so probably same goes for URI). Use first header.
554
# TODO: Once we get rid of addinfourl objects, the
555
# following will need to be updated to use correct case
557
if 'location' in headers:
558
newurl = headers.getheaders('location')[0]
559
elif 'uri' in headers:
560
newurl = headers.getheaders('uri')[0]
563
if self._debuglevel > 0:
564
print 'Redirected to: %s' % newurl
565
newurl = urlparse.urljoin(req.get_full_url(), newurl)
567
# This call succeeds or raise an error. urllib2 returns
568
# if redirect_request returns None, but our
569
# redirect_request never returns None.
570
redirected_req = self.redirect_request(req, fp, code, msg, headers,
574
# .redirect_dict has a key url if url was previously visited.
575
if hasattr(req, 'redirect_dict'):
576
visited = redirected_req.redirect_dict = req.redirect_dict
577
if (visited.get(newurl, 0) >= self.max_repeats or
578
len(visited) >= self.max_redirections):
579
raise urllib2.HTTPError(req.get_full_url(), code,
580
self.inf_msg + msg, headers, fp)
582
visited = redirected_req.redirect_dict = req.redirect_dict = {}
583
visited[newurl] = visited.get(newurl, 0) + 1
585
# We can close the fp now that we are sure that we won't
586
# use it with HTTPError.
588
# We have all we need already in the response
589
req.connection.fake_close()
591
return self.parent.open(redirected_req)
593
http_error_302 = http_error_303 = http_error_307 = http_error_30x
595
def http_error_301(self, req, fp, code, msg, headers):
596
response = self.http_error_30x(req, fp, code, msg, headers)
597
# If one or several 301 response occur during the
598
# redirection chain, we MUST update the original request
599
# to indicate where the URI where finally found.
602
while original_req.parent is not None:
603
original_req = original_req.parent
604
if original_req.redirected_to is None:
605
# Only the last occurring 301 should be taken
606
# into account i.e. the first occurring here when
607
# redirected_to has not yet been set.
608
original_req.redirected_to = redirected_url
612
class ProxyHandler(urllib2.ProxyHandler):
613
"""Handles proxy setting.
615
Copied and modified from urllib2 to be able to modify the
616
request during the request pre-processing instead of
617
modifying it at _open time. As we capture (or create) the
618
connection object during request processing, _open time was
621
Note that the proxy handling *may* modify the protocol used;
622
the request may be against an https server proxied through an
623
http proxy. So, https_request will be called, but later it's
624
really http_open that will be called. This explain why we
625
don't have to call self.parent.open as the urllib2 did.
628
# Proxies must be in front
632
def __init__(self, proxies=None):
633
urllib2.ProxyHandler.__init__(self, proxies)
634
# First, let's get rid of urllib2 implementation
635
for type, proxy in self.proxies.items():
636
if self._debuglevel > 0:
637
print 'Will unbind %s_open for %r' % (type, proxy)
638
delattr(self, '%s_open' % type)
640
# We are interested only by the http[s] proxies
641
http_proxy = self.get_proxy_env_var('http')
642
https_proxy = self.get_proxy_env_var('https')
644
if http_proxy is not None:
645
if self._debuglevel > 0:
646
print 'Will bind http_request for %r' % http_proxy
647
setattr(self, 'http_request',
648
lambda request: self.set_proxy(request, 'http'))
650
if https_proxy is not None:
651
if self._debuglevel > 0:
652
print 'Will bind http_request for %r' % https_proxy
653
setattr(self, 'https_request',
654
lambda request: self.set_proxy(request, 'https'))
656
def get_proxy_env_var(self, name, default_to='all'):
657
"""Get a proxy env var.
659
Note that we indirectly rely on
660
urllib.getproxies_environment taking into account the
661
uppercased values for proxy variables.
664
return self.proxies[name.lower()]
666
if default_to is not None:
667
# Try to get the alternate environment variable
669
return self.proxies[default_to]
674
def proxy_bypass(self, host):
675
"""Check if host should be proxied or not"""
676
no_proxy = self.get_proxy_env_var('no', None)
679
hhost, hport = urllib.splitport(host)
680
# Does host match any of the domains mentioned in
681
# no_proxy ? The rules about what is authorized in no_proxy
682
# are fuzzy (to say the least). We try to allow most
683
# commonly seen values.
684
for domain in no_proxy.split(','):
685
dhost, dport = urllib.splitport(domain)
686
if hport == dport or dport is None:
688
dhost = dhost.replace(".", r"\.")
689
dhost = dhost.replace("*", r".*")
690
dhost = dhost.replace("?", r".")
691
if re.match(dhost, hhost, re.IGNORECASE):
693
# Nevertheless, there are platform-specific ways to
695
return urllib.proxy_bypass(host)
697
def set_proxy(self, request, type):
698
if self.proxy_bypass(request.get_host()):
701
proxy = self.get_proxy_env_var(type)
702
if self._debuglevel > 0:
703
print 'set_proxy %s_request for %r' % (type, proxy)
704
orig_type = request.get_type()
705
type, r_type = urllib.splittype(proxy)
706
host, XXX = urllib.splithost(r_type)
708
user_pass, host = host.split('@', 1)
710
user, password = user_pass.split(':', 1)
711
user_pass = '%s:%s' % (urllib.unquote(user),
712
urllib.unquote(password))
713
user_pass.encode('base64').strip()
714
req.add_header('Proxy-authorization', 'Basic ' + user_pass)
715
host = urllib.unquote(host)
716
request.set_proxy(host, type)
717
if self._debuglevel > 0:
718
print 'set_proxy: proxy set to %r://%r' % (type, host)
722
class HTTPBasicAuthHandler(urllib2.HTTPBasicAuthHandler):
    """Custom basic authentication handler.

    Send the authentication preventively to avoid the
    roundtrip associated with the 401 error.
    """

    # TODO: Not implemented yet; sketch of the intended scheme:
    # def http_request(self, request):
    #     """Insert an authentification header if information is available"""
    #     if request.auth == 'basic' and request.password is not None:
    #         ...
class HTTPErrorProcessor(urllib2.HTTPErrorProcessor):
    """Process HTTP error responses.

    We don't really process the errors, quite the contrary
    instead, we leave our Transport handle them.
    """

    handler_order = 1000 # after all other processing

    def http_response(self, request, response):
        """Let success responses through, route the rest to the error chain."""
        code, msg, hdrs = response.code, response.msg, response.info()

        if code not in (200, # Ok
                        206, # Partial content
                        ):
            response = self.parent.error('http', request, response,
                                         code, msg, hdrs)
        return response

    https_response = http_response
class HTTPDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    """Translate common errors into bzr Exceptions"""

    def http_error_default(self, req, fp, code, msg, hdrs):
        if code == 404:
            # FIX: qualify HTTPError with its module, the bare name
            # is not bound in this file.
            raise errors.NoSuchFile(req.get_selector(),
                                    extra=urllib2.HTTPError(req.get_full_url(),
                                                            code, msg,
                                                            hdrs, fp))
        elif code == 403:
            raise errors.TransportError('Server refuses to fullfil the request')
        elif code == 416:
            # We don't know which, but one of the ranges we
            # specified was wrong. So we raise with 0 for a lack
            # of a better magic value.
            raise errors.InvalidRange(req.get_full_url(), 0)
        else:
            # TODO: A test is needed to exercise that code path
            raise errors.InvalidHttpResponse(req.get_full_url(),
                                             'Unable to handle http code %d: %s'
                                             % (code, msg))
class Opener(object):
    """A wrapper around urllib2.build_opener

    Daughter classes can override to build their own specific opener
    """
    # TODO: Provides hooks for daughter classes.

    def __init__(self,
                 connection=ConnectionHandler,
                 redirect=HTTPRedirectHandler,
                 error=HTTPErrorProcessor,):
        """Build the opener from the given handlers.

        :param connection: handler providing connection sharing.
        :param redirect: handler following redirections.
        :param error: handler routing error responses.
        """
        self.password_manager = PasswordManager()
        # TODO: Implements the necessary wrappers for the handlers
        # commented out below
        self._opener = urllib2.build_opener(
            connection, redirect, error,
            ProxyHandler,
            urllib2.HTTPBasicAuthHandler(self.password_manager),
            #urllib2.HTTPDigestAuthHandler(self.password_manager),
            #urllib2.ProxyBasicAuthHandler,
            #urllib2.ProxyDigestAuthHandler,
            HTTPHandler,
            HTTPSHandler,
            HTTPDefaultErrorHandler,
            )
        self.open = self._opener.open
        if DEBUG >= 2:
            # When dealing with handler order, it's easy to mess
            # things up, the following will help understand which
            # handler is used, when and for what.
            import pprint
            pprint.pprint(self._opener.__dict__)