# Copyright (C) 2006 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Implementation of urllib2 tailored to bzr needs

This file re-implements the urllib2 class hierarchy with custom classes.

For instance, we create a new HTTPConnection and HTTPSConnection that inherit
from the original urllib2.HTTP(s)Connection objects, but also have a new base
which implements a custom getresponse and fake_close handlers.

And then we implement custom HTTPHandler and HTTPSHandler classes, that use
the custom HTTPConnection classes.

We have a custom Response class, which lets us maintain a keep-alive
connection even for requests that urllib2 doesn't expect to contain body data.

And a custom Request class that lets us track redirections, and send
authentication data without requiring an extra round trip to get rejected by
the server. We also create a Request hierarchy, to make it clear what type
of request is being made.
"""
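
# A condensed, purely illustrative sketch of the wiring described above; the
# real wiring lives in the Opener class at the end of this file, and the URL
# below is made up:
#
#   opener = urllib2.build_opener(ConnectionHandler, HTTPRedirectHandler,
#                                 HTTPErrorProcessor, HTTPHandler,
#                                 HTTPSHandler, HTTPDefaultErrorHandler)
#   response = opener.open(Request('GET', 'http://example.com/path'))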

# TODO: It may be possible to share the password_manager across
# all transports by prefixing the realm by the protocol used
# (especially if other protocols do not use realms). See
# PasswordManager below.

# FIXME: Oversimplifying, two kinds of exceptions should be
# raised once a request is issued: URLError before we have been
# able to process the response, HTTPError after that. Processing
# the response means we are able to leave the socket clean, so if
# we are not able to do that, we should close the connection. The
# actual code more or less does that, tests should be written to
# ensure that.

import httplib
import socket
import sys
import urllib
import urllib2
import urlparse

from bzrlib import __version__ as bzrlib_version
from bzrlib import errors

# Propagated to the connections and handlers below; raise it to get verbose
# debugging output on stdout.
DEBUG = 0


# We define our own Response class to keep our httplib pipe clean
class Response(httplib.HTTPResponse):
    """Custom HTTPResponse, to avoid the need to decorate.

    httplib prefers to decorate the returned objects, rather
    than using a custom object.
    """

    # Some responses have bodies in which we have no interest
    _body_ignored_responses = [301, 302, 303, 307, 401, 403, 404]

    def __init__(self, *args, **kwargs):
        httplib.HTTPResponse.__init__(self, *args, **kwargs)

    def begin(self):
        """Begin to read the response from the server.

        httplib assumes that some responses get no content and does
        not even attempt to read the body in that case, leaving
        the body in the socket, blocking the next request. Let's
        try to work around that.
        """
        httplib.HTTPResponse.begin(self)
        if self.status in self._body_ignored_responses:
            if self.debuglevel > 0:
                print "For status: [%s]," % self.status,
                print "will read body, length: ",
                if self.length is not None:
                    print "[%d]" % self.length
                else:
                    print "None"
            if not (self.length is None or self.will_close):
                # In some cases, we just can't read the body and
                # must not even try, or we may encounter a 104,
                # 'Connection reset by peer' error if there is
                # indeed no body and the server closed the
                # connection just after having issued the response
                # headers (even if the headers indicate a
                # Content-Type...)
                body = self.fp.read(self.length)
                if self.debuglevel > 0:
                    print "Consumed body: [%s]" % body
            self.close()


# Not inheriting from 'object' because httplib.HTTPConnection doesn't.
class AbstractHTTPConnection:
    """A custom HTTP(S) Connection, which can reset itself on a bad response"""

    response_class = Response
    strict = 1 # We don't support HTTP/0.9

    def fake_close(self):
        """Make the connection believe the response has been fully handled.

        That makes the httplib.HTTPConnection happy
        """
        # Preserve our preciousss
        sock = self.sock
        self.sock = None
        # Let httplib.HTTPConnection do its housekeeping
        self.close()
        # Restore our preciousss
        self.sock = sock


class HTTPConnection(AbstractHTTPConnection, httplib.HTTPConnection):
    pass


class HTTPSConnection(AbstractHTTPConnection, httplib.HTTPSConnection):
    pass


class Request(urllib2.Request):
    """A custom Request object.

    urllib2 determines the request method heuristically (based on
    the presence or absence of data). We set the method
    statically.

    Also, the Request object tracks the connection the request will
    be made on.
    """

    def __init__(self, method, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 connection=None, parent=None,):
        # urllib2.Request will be confused if we don't extract
        # authentication info before building the request
        url, self.user, self.password = self.extract_auth(url)
        urllib2.Request.__init__(self, url, data, headers,
                                 origin_req_host, unverifiable)
        self.method = method
        self.connection = connection
        # To handle redirections
        self.parent = parent
        self.redirected_to = None

    def extract_auth(self, url):
        """Extracts authentication information from url.

        Get user and password from url of the form: http://user:pass@host/path
        """
        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)

        if '@' in netloc:
            auth, netloc = netloc.split('@', 1)
            if ':' in auth:
                user, password = auth.split(':', 1)
            else:
                user, password = auth, None
            user = urllib.unquote(user)
            if password is not None:
                password = urllib.unquote(password)
        else:
            user = None
            password = None

        url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))

        return url, user, password
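
    # For illustration only (not executed); the credentials and URL below are
    # hypothetical:
    #
    #   url, user, password = request.extract_auth(
    #       'http://joe:secret@example.com/repo')
    #   # url      -> 'http://example.com/repo'
    #   # user     -> 'joe'
    #   # password -> 'secret'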

    def get_method(self):
        return self.method


# The urllib2.xxxAuthHandlers handle the authentication of the
# requests; to do that, they need an urllib2 PasswordManager *at
# build time*. We also need one to reuse the passwords already
# typed by the user.
class PasswordManager(urllib2.HTTPPasswordMgrWithDefaultRealm):

    def __init__(self):
        urllib2.HTTPPasswordMgrWithDefaultRealm.__init__(self)


class ConnectionHandler(urllib2.BaseHandler):
    """Provides connection-sharing by pre-processing requests.

    urllib2 provides no way to access the HTTPConnection object
    internally used. But we need it in order to achieve
    connection sharing. So, we add it to the request just before
    it is processed, and then we override the do_open method for
    the http(s) requests.
    """

    handler_order = 1000 # after all pre-processings

    def get_key(self, connection):
        """Returns the key for the connection in the cache"""
        return '%s:%d' % (connection.host, connection.port)

    def create_connection(self, request, http_connection_class):
        host = request.get_host()
        if not host:
            # Just a bit of paranoia here, this should have been
            # handled in the higher levels
            raise errors.InvalidURL(request.get_full_url(), 'no host given.')

        # We create a connection (but it will not connect yet)
        try:
            connection = http_connection_class(host)
        except httplib.InvalidURL, exception:
            # There is only one occurrence of InvalidURL in httplib
            raise errors.InvalidURL(request.get_full_url(),
                                    extra='nonnumeric port')

        return connection

    def capture_connection(self, request, http_connection_class):
        """Capture or inject the request connection.

        Two cases:
        - the request has no connection: create a new one,

        - the request has a connection: it has been used
          already, let's capture it, so that we can give it to
          another transport to be reused. We don't do that
          ourselves: the Transport object gets the connection from
          a first request and then propagates it, from request to
          request or to cloned transports.
        """
        connection = request.connection
        if connection is None:
            connection = self.create_connection(request, http_connection_class)
            request.connection = connection

        # All connections will pass here, propagate debug level
        connection.set_debuglevel(DEBUG)
        return request

    def http_request(self, request):
        return self.capture_connection(request, HTTPConnection)

    def https_request(self, request):
        return self.capture_connection(request, HTTPSConnection)
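
# A rough, purely illustrative sketch of the sharing this enables (the URLs
# are made up): the handler attaches a connection to the first request while
# pre-processing it; the Transport then reads request.connection back and
# injects it into the next request so both use the same socket:
#
#   opener = Opener()   # see the Opener class at the end of this file
#   first = Request('GET', 'http://example.com/a')
#   opener.open(first)        # ConnectionHandler sets first.connection
#   second = Request('GET', 'http://example.com/b',
#                    connection=first.connection)
#   opener.open(second)       # reuses the captured connection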


class AbstractHTTPHandler(urllib2.AbstractHTTPHandler):
    """A custom handler for HTTP(S) requests.

    We override urllib2.AbstractHTTPHandler to get a better
    control of the connection, the ability to implement new
    request types and return a response able to cope with
    persistent connections.
    """

    # We change our order to be before urllib2 HTTP[S]Handlers
    # and be chosen instead of them (the first http_open called
    # wins).
    handler_order = 400

    _default_headers = {'Pragma': 'no-cache',
                        'Cache-control': 'max-age=0',
                        'Connection': 'Keep-Alive',
                        # FIXME: Spell it User-*A*gent once we
                        # know how to properly avoid bogus
                        # urllib2 using capitalize() for headers
                        # instead of title(sp?).
                        'User-agent': 'bzr/%s (urllib)' % bzrlib_version,
                        # FIXME: pycurl also sets the following, understand why
                        'Accept': '*/*',
                        }

    def __init__(self):
        urllib2.AbstractHTTPHandler.__init__(self, debuglevel=DEBUG)

    def http_request(self, request):
        """Common headers setting"""

        request.headers.update(self._default_headers.copy())
        # FIXME: We may have to add the Content-Length header if
        # we have data to send.
        return request

    def retry_or_raise(self, http_class, request, first_try):
        """Retry the request (once) or raise the exception.

        urllib2 raises exceptions of an application-level kind; we
        just have to translate them.

        httplib can raise exceptions of a transport-level kind (badly
        formatted dialog, loss of connection or socket-level
        problems). In that case we should issue the request again
        (httplib will close and reopen a new connection if
        needed).
        """
        # When an exception occurs, we give back the original
        # Traceback or the bugs are hard to diagnose.
        exc_type, exc_val, exc_tb = sys.exc_info()
        if exc_type == socket.gaierror:
            # No need to retry, that will not help
            raise errors.ConnectionError("Couldn't resolve host '%s'"
                                         % request.get_origin_req_host(),
                                         orig_error=exc_val)
        else:
            if first_try:
                if self._debuglevel > 0:
                    print 'Received exception: [%r]' % exc_val
                    print '  On connection: [%r]' % request.connection
                    method = request.get_method()
                    url = request.get_full_url()
                    print '  Will retry, %s %r' % (method, url)
                request.connection.close()
                response = self.do_open(http_class, request, False)
                convert_to_addinfourl = False
            else:
                if self._debuglevel > 0:
                    print 'Received second exception: [%r]' % exc_val
                    print '  On connection: [%r]' % request.connection
                if exc_type in (httplib.BadStatusLine, httplib.UnknownProtocol):
                    # httplib.BadStatusLine and
                    # httplib.UnknownProtocol indicate that a
                    # bogus server was encountered or a bad
                    # connection (i.e. transient errors) was
                    # experienced; we have already retried once
                    # for that request so we raise the exception.
                    my_exception = errors.InvalidHttpResponse(
                        request.get_full_url(),
                        'Bad status line received',
                        orig_error=exc_val)
                else:
                    # All other exceptions are considered connection related.

                    # httplib.HTTPException should indicate a bug
                    # in the urllib implementation, somehow the
                    # httplib pipeline is in an incorrect state,
                    # we retry in hope that this will correct the
                    # problem but that may need investigation
                    # (note that no such bug is known as of
                    # this writing).

                    # socket errors generally occur for reasons
                    # far outside our scope, so closing the
                    # connection and retrying is the best we can
                    # do.

                    # FIXME: and then there is HTTPError raised by:
                    # - HTTPDefaultErrorHandler (we define our own)
                    # - HTTPRedirectHandler.redirect_request
                    # - AbstractDigestAuthHandler.http_error_auth_reqed

                    my_exception = errors.ConnectionError(
                        msg='while sending %s %s:' % (request.get_method(),
                                                      request.get_selector()),
                        orig_error=exc_val)

                if self._debuglevel > 0:
                    print 'On connection: [%r]' % request.connection
                    method = request.get_method()
                    url = request.get_full_url()
                    print '  Failed again, %s %r' % (method, url)
                    print '  Will raise: [%r]' % my_exception
                raise my_exception, None, exc_tb
        return response, convert_to_addinfourl

    def do_open(self, http_class, request, first_try=True):
        """See urllib2.AbstractHTTPHandler.do_open for the general idea.

        The request will be retried once if it fails.
        """
        connection = request.connection
        assert connection is not None, \
            'Cannot process a request without a connection'

        # Get all the headers
        headers = {}
        headers.update(request.header_items())
        headers.update(request.unredirected_hdrs)

        try:
            connection._send_request(request.get_method(),
                                     request.get_selector(),
                                     # FIXME: implements 100-continue
                                     #None, # We don't send the body yet
                                     request.get_data(),
                                     headers)
            if self._debuglevel > 0:
                print 'Request sent: [%r]' % request
            response = connection.getresponse()
            convert_to_addinfourl = True
        except (socket.gaierror, httplib.BadStatusLine, httplib.UnknownProtocol,
                socket.error, httplib.HTTPException):
            response, convert_to_addinfourl = self.retry_or_raise(http_class,
                                                                  request,
                                                                  first_try)

        # FIXME: HTTPConnection does not fully support 100-continue (the
        # server responses are just ignored)

        # if code == 100:
        #     mutter('Will send the body')
        #     # We can send the body now
        #     body = request.get_data()
        #     if body is None:
        #         raise URLError("No data given")
        #     connection.send(body)
        #     response = connection.getresponse()

        if self._debuglevel > 0:
            print 'Receives response: %r' % response
            print '  For: %r(%r)' % (request.get_method(),
                                     request.get_full_url())

        if convert_to_addinfourl:
            # Shamelessly copied from urllib2
            req = request
            r = response
            r.recv = r.read
            fp = socket._fileobject(r)
            resp = urllib2.addinfourl(fp, r.msg, req.get_full_url())
            resp.code = r.status
            resp.msg = r.reason
            if self._debuglevel > 0:
                print 'Create addinfourl: %r' % resp
                print '  For: %r(%r)' % (request.get_method(),
                                         request.get_full_url())
            response = resp

#           # we need titled headers in a dict but
#           # response.getheaders returns a list of (lower(header), value)
#           # Let's title that because most of bzr handle titled
#           # headers, but maybe we should switch to lowercased
#           # headers...
#           # jam 20060908: I think we actually expect the headers to
#           # be similar to mimetools.Message object, which uses
#           # case insensitive keys. It lowers() all requests.
#           # My concern is that the code may not do perfect title case.
#           # For example, it may use Content-type rather than Content-Type
#
#           # When we get rid of addinfourl, we must ensure that bzr
#           # always use titled headers and that any header received
#           # from server is also titled.
#
#           headers = {}
#           for header, value in (response.getheaders()):
#               headers[header.title()] = value
#           # FIXME: Implements a secured .read method
#           response.code = response.status
#           response.headers = headers

        return response


class HTTPHandler(AbstractHTTPHandler):
    """A custom handler that just thunks into HTTPConnection"""

    def http_open(self, request):
        return self.do_open(HTTPConnection, request)


class HTTPSHandler(AbstractHTTPHandler):
    """A custom handler that just thunks into HTTPSConnection"""

    def https_open(self, request):
        return self.do_open(HTTPSConnection, request)


class HTTPRedirectHandler(urllib2.HTTPRedirectHandler):
    """Handles redirect requests.

    We have to implement our own scheme because we use a specific
    Request object and because we want to implement a specific
    policy.
    """
    _debuglevel = DEBUG
    # RFC2616 says that only read requests should be redirected
    # without interacting with the user. But bzr uses some
    # shortcuts to optimize against roundtrips, which can lead to
    # write requests being issued before read requests of
    # containing dirs can be redirected. So we redirect write
    # requests in the same way, which seems to respect the spirit
    # of the RFC if not its letter.

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """See urllib2.HTTPRedirectHandler.redirect_request"""
        # We would have preferred to update the request instead
        # of creating a new one, but the urllib2.Request object
        # has a too complicated creation process to provide a
        # simple enough equivalent update process. Instead, when
        # redirecting, we only update the original request with a
        # reference to the following request in the redirect
        # chain.

        # Some codes make no sense in our context and are treated
        # as errors:

        # 300: Multiple choices for different representations of
        # the URI. Using that mechanism with bzr will violate the
        # protocol neutrality of Transport.

        # 304: Not modified (SHOULD only occur with conditional
        # GETs which are not used by our implementation)

        # 305: Use proxy. I can't imagine this one occurring in
        # our context-- vila/20060909

        # 306: Unused (if the RFC says so...)

        # FIXME: If the code is 302 and the request is HEAD, we
        # MAY avoid following the redirections if the intent is
        # to check the existence; we have a hint that the file
        # exists, now if we want to be sure, we must follow the
        # redirection. Let's do that for now.

        if code in (301, 302, 303, 307):
            return Request(req.get_method(), newurl,
                           headers=req.headers,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True,
                           # TODO: It will be nice to be able to
                           # detect virtual hosts sharing the same
                           # IP address, that will allow us to
                           # share the same connection...
                           connection=None,
                           parent=req,
                           )
        else:
            raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

    def http_error_30x(self, req, fp, code, msg, headers):
        """Requests the redirected-to URI.

        Copied from urllib2 to be able to fake_close the
        associated connection, *before* issuing the redirected
        request but *after* having eventually raised an error.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI). Use first header.

        # TODO: Once we get rid of addinfourl objects, the
        # following will need to be updated to use correct case
        # for the headers.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        if self._debuglevel > 0:
            print 'Redirected to: %s' % newurl
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # This call succeeds or raises an error. urllib2 returns
        # if redirect_request returns None, but our
        # redirect_request never returns None.
        redirected_req = self.redirect_request(req, fp, code, msg, headers,
                                               newurl)

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = redirected_req.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise urllib2.HTTPError(req.get_full_url(), code,
                                        self.inf_msg + msg, headers, fp)
        else:
            visited = redirected_req.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # We can close the fp now that we are sure that we won't
        # use it with HTTPError.
        fp.close()
        # We have all we need already in the response
        req.connection.fake_close()

        return self.parent.open(redirected_req)

    http_error_302 = http_error_303 = http_error_307 = http_error_30x

    def http_error_301(self, req, fp, code, msg, headers):
        response = self.http_error_30x(req, fp, code, msg, headers)
        # If one or several 301 responses occur during the
        # redirection chain, we MUST update the original request
        # to indicate where the URI was finally found.

        redirected_url = response.geturl()
        original_req = req
        while original_req.parent is not None:
            original_req = original_req.parent
            if original_req.redirected_to is None:
                # Only the last occurring 301 should be taken
                # into account i.e. the first occurring here when
                # redirected_to has not yet been set.
                original_req.redirected_to = redirected_url
        return response
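
# Illustrative only: after following a redirection chain, a caller can learn
# where a permanently moved resource now lives by reading redirected_to on
# the request it originally issued (it stays None when no 301 was seen):
#
#   opener = Opener()   # see the Opener class at the end of this file
#   request = Request('GET', 'http://example.com/old-location')
#   response = opener.open(request)
#   if request.redirected_to is not None:
#       new_url = request.redirected_to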


class HTTPBasicAuthHandler(urllib2.HTTPBasicAuthHandler):
    """Custom basic authentication handler.

    Send the authentication preventively to avoid the roundtrip
    associated with the 401 error.
    """

#    def http_request(self, request):
#        """Insert an authentication header if information is available"""
#        if request.auth == 'basic' and request.password is not None:
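#            # A possible completion of the idea above, left commented out as
#            # a sketch only: the Authorization header and base64 encoding are
#            # standard HTTP Basic auth, but request.auth is an assumed
#            # attribute that nothing in this file sets yet.
#            import base64
#            raw = '%s:%s' % (request.user, request.password)
#            request.add_unredirected_header('Authorization',
#                                            'Basic ' + base64.b64encode(raw))
#        return request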


class HTTPErrorProcessor(urllib2.HTTPErrorProcessor):
    """Process HTTP error responses.

    We don't really process the errors; quite the contrary,
    we let our Transport handle them.
    """
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code not in (200, # Ok
                        206, # Partial content
                        404, # Not found
                        ):
            response = self.parent.error('http', request, response,
                                         code, msg, hdrs)
        return response

    https_response = http_response


class HTTPDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    """Translate common errors into bzr Exceptions"""

    def http_error_default(self, req, fp, code, msg, hdrs):
        if code == 404:
            raise errors.NoSuchFile(req.get_selector(),
                                    extra=urllib2.HTTPError(req.get_full_url(),
                                                            code, msg,
                                                            hdrs, fp))
        elif code == 403:
            raise errors.TransportError('Server refuses to fulfil the request')
        else:
            # TODO: A test is needed to exercise that code path
            raise errors.InvalidHttpResponse(req.get_full_url(),
                                             'Unable to handle http code %d: %s'
                                             % (code, msg))


class Opener(object):
    """A wrapper around urllib2.build_opener

    Daughter classes can override to build their own specific opener
    """
    # TODO: Provides hooks for daughter classes.

    def __init__(self,
                 connection=ConnectionHandler,
                 redirect=HTTPRedirectHandler,
                 error=HTTPErrorProcessor,):
        self.password_manager = PasswordManager()
        # TODO: Implements the necessary wrappers for the handlers
        # commented out below
        self._opener = urllib2.build_opener(
            connection, redirect, error,
            #urllib2.ProxyHandler,
            urllib2.HTTPBasicAuthHandler(self.password_manager),
            #urllib2.HTTPDigestAuthHandler(self.password_manager),
            #urllib2.ProxyBasicAuthHandler,
            #urllib2.ProxyDigestAuthHandler,
            HTTPHandler,
            HTTPSHandler,
            HTTPDefaultErrorHandler,
            )
        self.open = self._opener.open
        if DEBUG >= 2:
            # When dealing with handler order, it's easy to mess
            # things up, the following will help understand which
            # handler is used, when and for what.
            import pprint
            pprint.pprint(self._opener.__dict__)
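
# A usage sketch, kept as a comment so importing this module stays free of
# side effects (the URL is illustrative); daughter classes can pass their own
# handler classes through the keyword arguments above:
#
#   opener = Opener()
#   response = opener.open(Request('GET', 'http://example.com/path'))
#   print response.code
#   data = response.read()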