1
# Copyright (C) 2005, 2006 Canonical Ltd
1
# Copyright (C) 2005 Canonical Ltd
3
3
# This program is free software; you can redistribute it and/or modify
4
4
# it under the terms of the GNU General Public License as published by
5
5
# the Free Software Foundation; either version 2 of the License, or
6
6
# (at your option) any later version.
8
8
# This program is distributed in the hope that it will be useful,
9
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
11
# GNU General Public License for more details.
13
13
# You should have received a copy of the GNU General Public License
14
14
# along with this program; if not, write to the Free Software
15
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Base implementation of Transport over http.
19
There are separate implementation modules for each http client implementation.
16
"""Implementation of Transport over http.
19
from bzrlib.transport import Transport, register_transport
20
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
21
TransportError, ConnectionError)
22
23
from cStringIO import StringIO
24
import urllib, urllib2
31
from warnings import warn
33
# TODO: load these only when running http tests
34
import BaseHTTPServer, SimpleHTTPServer, socket, time
37
from bzrlib import errors
38
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
39
TransportError, ConnectionError, InvalidURL)
27
from bzrlib.errors import BzrError, BzrCheckError
40
28
from bzrlib.branch import Branch
41
29
from bzrlib.trace import mutter
42
from bzrlib.transport import Transport, register_transport, Server
43
from bzrlib.transport.http.response import (HttpMultipartRangeResponse,
45
from bzrlib.ui import ui_factory
48
32
def extract_auth(url, password_manager):
49
"""Extract auth parameters from am HTTP/HTTPS url and add them to the given
34
Extract auth parameters from an HTTP/HTTPS url and add them to the given
50
35
password manager. Return the url, minus those auth parameters (which
53
assert re.match(r'^(https?)(\+\w+)?://', url), \
54
'invalid absolute url %r' % url
55
scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
58
auth, netloc = netloc.split('@', 1)
38
assert url.startswith('http://') or url.startswith('https://')
39
scheme, host = url.split('//', 1)
41
host, path = host.split('/', 1)
47
auth, host = host.split('@', 1)
60
49
username, password = auth.split(':', 1)
62
51
username, password = auth, None
64
host = netloc.split(':', 1)[0]
67
username = urllib.unquote(username)
53
host, port = host.split(':', 1)
55
# FIXME: if password isn't given, should we ask for it?
68
56
if password is not None:
57
username = urllib.unquote(username)
69
58
password = urllib.unquote(password)
71
password = ui_factory.get_password(prompt='HTTP %(user)@%(host) password',
72
user=username, host=host)
73
password_manager.add_password(None, host, username, password)
74
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
59
password_manager.add_password(None, host, username, password)
60
url = scheme + '//' + host + port + path
78
def _extract_headers(header_text, url):
79
"""Extract the mapping for an rfc2822 header
81
This is a helper function for the test suite and for _pycurl.
82
(urllib already parses the headers for us)
84
In the case that there are multiple headers inside the file,
85
the last one is returned.
87
:param header_text: A string of header information.
88
This expects that the first line of a header will always be HTTP ...
89
:param url: The url we are parsing, so we can raise nice errors
90
:return: mimetools.Message object, which basically acts like a case
91
insensitive dictionary.
94
remaining = header_text
97
raise errors.InvalidHttpResponse(url, 'Empty headers')
100
header_file = StringIO(remaining)
101
first_line = header_file.readline()
102
if not first_line.startswith('HTTP'):
103
if first_header: # The first header *must* start with HTTP
104
raise errors.InvalidHttpResponse(url,
105
'Opening header line did not start with HTTP: %s'
107
assert False, 'Opening header line was not HTTP'
109
break # We are done parsing
111
m = mimetools.Message(header_file)
113
# mimetools.Message parses the first header up to a blank line
114
# So while there is remaining data, it probably means there is
115
# another header to be parsed.
116
# Get rid of any preceding whitespace, which if it is all whitespace
117
# will get rid of everything.
118
remaining = header_file.read().lstrip()
122
class HttpTransportBase(Transport):
123
"""Base class for http implementations.
125
Does URL parsing, etc, but not any network IO.
127
The protocol can be given as e.g. http+urllib://host/ to use a particular
131
# _proto: "http" or "https"
132
# _qualified_proto: may have "+pycurl", etc
65
mutter("get_url %s" % url)
66
manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
67
url = extract_auth(url, manager)
68
auth_handler = urllib2.HTTPBasicAuthHandler(manager)
69
opener = urllib2.build_opener(auth_handler)
70
url_f = opener.open(url)
73
class HttpTransport(Transport):
74
"""This is the transport agent for http:// access.
76
TODO: Implement pipelined versions of all of the *_multi() functions.
134
79
def __init__(self, base):
135
80
"""Set the base path where files will be stored."""
136
proto_match = re.match(r'^(https?)(\+\w+)?://', base)
138
raise AssertionError("not a http url: %r" % base)
139
self._proto = proto_match.group(1)
140
impl_name = proto_match.group(2)
142
impl_name = impl_name[1:]
143
self._impl_name = impl_name
146
super(HttpTransportBase, self).__init__(base)
81
assert base.startswith('http://') or base.startswith('https://')
82
super(HttpTransport, self).__init__(base)
147
83
# In the future we might actually connect to the remote host
148
84
# rather than using get_url
149
85
# self._connection = None
150
(apparent_proto, self._host,
86
(self._proto, self._host,
151
87
self._path, self._parameters,
152
88
self._query, self._fragment) = urlparse.urlparse(self.base)
153
self._qualified_proto = apparent_proto
90
def should_cache(self):
91
"""Return True if the data pulled across should be cached locally.
95
def clone(self, offset=None):
96
"""Return a new HttpTransport with root at self.base + offset
97
For now HttpTransport does not actually connect, so just return
98
a new HttpTransport object.
101
return HttpTransport(self.base)
103
return HttpTransport(self.abspath(offset))
155
105
def abspath(self, relpath):
156
106
"""Return the full url to the given relative path.
158
This can be supplied with a string or a list.
160
The URL returned always has the protocol scheme originally used to
161
construct the transport, even if that includes an explicit
162
implementation qualifier.
107
This can be supplied with a string or a list
164
109
assert isinstance(relpath, basestring)
165
if isinstance(relpath, unicode):
166
raise InvalidURL(relpath, 'paths must not be unicode.')
167
110
if isinstance(relpath, basestring):
168
111
relpath_parts = relpath.split('/')
194
137
# I'm concerned about when it chooses to strip the last
195
138
# portion of the path, and when it doesn't.
196
139
path = '/'.join(basepath)
199
result = urlparse.urlunparse((self._qualified_proto,
200
self._host, path, '', '', ''))
203
def _real_abspath(self, relpath):
204
"""Produce absolute path, adjusting protocol if needed"""
205
abspath = self.abspath(relpath)
206
qp = self._qualified_proto
208
if self._qualified_proto != self._proto:
209
abspath = rp + abspath[len(qp):]
210
if not isinstance(abspath, str):
211
# escaping must be done at a higher level
212
abspath = abspath.encode('ascii')
140
return urlparse.urlunparse((self._proto,
141
self._host, path, '', '', ''))
215
143
def has(self, relpath):
216
raise NotImplementedError("has() is abstract on %r" % self)
218
def get(self, relpath):
144
"""Does the target location exist?
146
TODO: HttpTransport.has() should use a HEAD request,
147
not a full GET request.
149
TODO: This should be changed so that we don't use
150
urllib2 and get an exception, the code path would be
151
cleaner if we just do an http HEAD request, and parse
156
path = self.abspath(relpath)
158
# Without the read and then close()
159
# we tend to have busy sockets.
163
except urllib2.URLError, e:
164
mutter('url error code: %s for has url: %r', e.code, path)
169
mutter('io error: %s %s for has url: %r',
170
e.errno, errno.errorcode.get(e.errno), path)
171
if e.errno == errno.ENOENT:
173
raise TransportError(orig_error=e)
175
def get(self, relpath, decode=False):
219
176
"""Get the file at the given relative path.
221
178
:param relpath: The relative path to the file
223
code, response_file = self._get(relpath, None)
226
def _get(self, relpath, ranges):
227
"""Get a file, or part of a file.
229
:param relpath: Path relative to transport base URL
230
:param ranges: None to get the whole file;
231
or [(start,end)] to fetch parts of a file.
233
:returns: (http_code, result_file)
235
Note that the current http implementations can only fetch one range at
236
a time through this call.
238
raise NotImplementedError(self._get)
240
def readv(self, relpath, offsets):
241
"""Get parts of the file at the given relative path.
243
:param offsets: A list of (offset, size) tuples.
244
:return: A list or generator of (offset, data) tuples
246
ranges = self.offsets_to_ranges(offsets)
247
mutter('http readv of %s collapsed %s offsets => %s',
248
relpath, len(offsets), ranges)
249
code, f = self._get(relpath, ranges)
250
for start, size in offsets:
251
f.seek(start, (start < 0) and 2 or 0)
254
assert len(data) == size
258
def offsets_to_ranges(offsets):
259
"""Turn a list of offsets and sizes into a list of byte ranges.
261
:param offsets: A list of tuples of (start, size). An empty list
263
:return: a list of inclusive byte ranges (start, end)
264
Adjacent ranges will be combined.
266
# Make sure we process sorted offsets
267
offsets = sorted(offsets)
272
for start, size in offsets:
273
end = start + size - 1
275
combined.append([start, end])
276
elif start <= prev_end + 1:
277
combined[-1][1] = end
279
combined.append([start, end])
182
path = self.abspath(relpath)
184
except urllib2.HTTPError, e:
185
mutter('url error code: %s for has url: %r', e.code, path)
187
raise NoSuchFile(path, extra=e)
189
except (BzrError, IOError), e:
190
if hasattr(e, 'errno'):
191
mutter('io error: %s %s for has url: %r',
192
e.errno, errno.errorcode.get(e.errno), path)
193
if e.errno == errno.ENOENT:
194
raise NoSuchFile(path, extra=e)
195
raise ConnectionError(msg = "Error retrieving %s: %s"
196
% (self.abspath(relpath), str(e)),
284
199
def put(self, relpath, f, mode=None):
285
200
"""Copy the file-like or string object into the location.
365
271
:return: A lock object, which should be passed to Transport.unlock()
367
273
raise TransportNotPossible('http does not support lock_write()')
369
def clone(self, offset=None):
370
"""Return a new HttpTransportBase with root at self.base + offset
371
For now HttpTransportBase does not actually connect, so just return
372
a new HttpTransportBase object.
375
return self.__class__(self.base)
377
return self.__class__(self.abspath(offset))
380
def range_header(ranges, tail_amount):
381
"""Turn a list of bytes ranges into a HTTP Range header value.
383
:param ranges: A list of byte ranges, (start, end). An empty list
386
:return: HTTP range header string.
389
for start, end in ranges:
390
strings.append('%d-%d' % (start, end))
393
strings.append('-%d' % tail_amount)
395
return ','.join(strings)
398
#---------------- test server facilities ----------------
399
# TODO: load these only when running tests
402
class WebserverNotAvailable(Exception):
406
class BadWebserverPath(ValueError):
408
return 'path %s is not in %s' % self.args
411
class TestingHTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
413
def log_message(self, format, *args):
414
self.server.test_case.log('webserver - %s - - [%s] %s "%s" "%s"',
415
self.address_string(),
416
self.log_date_time_string(),
418
self.headers.get('referer', '-'),
419
self.headers.get('user-agent', '-'))
421
def handle_one_request(self):
422
"""Handle a single HTTP request.
424
You normally don't need to override this method; see the class
425
__doc__ string for information on how to handle specific HTTP
426
commands such as GET and POST.
429
for i in xrange(1,11): # Don't try more than 10 times
431
self.raw_requestline = self.rfile.readline()
432
except socket.error, e:
433
if e.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK):
434
# omitted for now because some tests look at the log of
435
# the server and expect to see no errors. see recent
436
# email thread. -- mbp 20051021.
437
## self.log_message('EAGAIN (%d) while reading from raw_requestline' % i)
443
if not self.raw_requestline:
444
self.close_connection = 1
446
if not self.parse_request(): # An error code has been sent, just exit
448
mname = 'do_' + self.command
449
if not hasattr(self, mname):
450
self.send_error(501, "Unsupported method (%r)" % self.command)
452
method = getattr(self, mname)
455
if sys.platform == 'win32':
456
# On win32 you cannot access non-ascii filenames without
457
# decoding them into unicode first.
458
# However, under Linux, you can access bytestream paths
459
# without any problems. If this function was always active
460
# it would probably break tests when LANG=C was set
461
def translate_path(self, path):
462
"""Translate a /-separated PATH to the local filename syntax.
464
For bzr, all url paths are considered to be utf8 paths.
465
On Linux, you can access these paths directly over the bytestream
466
request, but on win32, you must decode them, and access them
469
# abandon query parameters
470
path = urlparse.urlparse(path)[2]
471
path = posixpath.normpath(urllib.unquote(path))
472
path = path.decode('utf-8')
473
words = path.split('/')
474
words = filter(None, words)
477
drive, word = os.path.splitdrive(word)
478
head, word = os.path.split(word)
479
if word in (os.curdir, os.pardir): continue
480
path = os.path.join(path, word)
484
class TestingHTTPServer(BaseHTTPServer.HTTPServer):
485
def __init__(self, server_address, RequestHandlerClass, test_case):
486
BaseHTTPServer.HTTPServer.__init__(self, server_address,
488
self.test_case = test_case
491
class HttpServer(Server):
492
"""A test server for http transports."""
494
# used to form the url that connects to this server
495
_url_protocol = 'http'
497
def _http_start(self):
499
httpd = TestingHTTPServer(('localhost', 0),
500
TestingHTTPRequestHandler,
502
host, port = httpd.socket.getsockname()
503
self._http_base_url = '%s://localhost:%s/' % (self._url_protocol, port)
504
self._http_starting.release()
505
httpd.socket.settimeout(0.1)
507
while self._http_running:
509
httpd.handle_request()
510
except socket.timeout:
513
def _get_remote_url(self, path):
514
path_parts = path.split(os.path.sep)
515
if os.path.isabs(path):
516
if path_parts[:len(self._local_path_parts)] != \
517
self._local_path_parts:
518
raise BadWebserverPath(path, self.test_dir)
519
remote_path = '/'.join(path_parts[len(self._local_path_parts):])
521
remote_path = '/'.join(path_parts)
523
self._http_starting.acquire()
524
self._http_starting.release()
525
return self._http_base_url + remote_path
527
def log(self, format, *args):
528
"""Capture Server log output."""
529
self.logs.append(format % args)
532
"""See bzrlib.transport.Server.setUp."""
533
self._home_dir = os.getcwdu()
534
self._local_path_parts = self._home_dir.split(os.path.sep)
535
self._http_starting = threading.Lock()
536
self._http_starting.acquire()
537
self._http_running = True
538
self._http_base_url = None
539
self._http_thread = threading.Thread(target=self._http_start)
540
self._http_thread.setDaemon(True)
541
self._http_thread.start()
542
self._http_proxy = os.environ.get("http_proxy")
543
if self._http_proxy is not None:
544
del os.environ["http_proxy"]
548
"""See bzrlib.transport.Server.tearDown."""
549
self._http_running = False
550
self._http_thread.join()
551
if self._http_proxy is not None:
553
os.environ["http_proxy"] = self._http_proxy
556
"""See bzrlib.transport.Server.get_url."""
557
return self._get_remote_url(self._home_dir)
559
def get_bogus_url(self):
560
"""See bzrlib.transport.Server.get_bogus_url."""
561
# this is chosen to try to prevent trouble with proxies, weird dns,
563
return 'http://127.0.0.1:1/'