1
# Copyright (C) 2005, 2006 Canonical Ltd
1
# Copyright (C) 2005 Canonical Ltd
3
3
# This program is free software; you can redistribute it and/or modify
4
4
# it under the terms of the GNU General Public License as published by
5
5
# the Free Software Foundation; either version 2 of the License, or
6
6
# (at your option) any later version.
8
8
# This program is distributed in the hope that it will be useful,
9
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
11
# GNU General Public License for more details.
13
13
# You should have received a copy of the GNU General Public License
14
14
# along with this program; if not, write to the Free Software
15
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Base implementation of Transport over http.
19
There are separate implementation modules for each http client implementation.
16
"""Implementation of Transport over http.
19
from bzrlib.transport import Transport, register_transport
20
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
21
NonRelativePath, TransportError, ConnectionError)
22
23
from cStringIO import StringIO
31
from warnings import warn
33
# TODO: load these only when running http tests
34
import BaseHTTPServer, SimpleHTTPServer, socket, time
37
from bzrlib import errors
38
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
39
TransportError, ConnectionError, InvalidURL)
27
from bzrlib.errors import BzrError, BzrCheckError
40
28
from bzrlib.branch import Branch
41
29
from bzrlib.trace import mutter
42
from bzrlib.transport import Transport, register_transport, Server
43
from bzrlib.transport.http.response import (HttpMultipartRangeResponse,
45
from bzrlib.ui import ui_factory
48
def extract_auth(url, password_manager):
49
"""Extract auth parameters from an HTTP/HTTPS url and add them to the given
50
password manager. Return the url, minus those auth parameters (which
53
assert re.match(r'^(https?)(\+\w+)?://', url), \
54
'invalid absolute url %r' % url
55
scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
34
mutter("get_url %s", url)
35
url_f = urllib2.urlopen(url)
38
class HttpTransportError(TransportError):
41
class HttpTransport(Transport):
42
"""This is the transport agent for http:// access.
58
auth, netloc = netloc.split('@', 1)
60
username, password = auth.split(':', 1)
62
username, password = auth, None
64
host = netloc.split(':', 1)[0]
67
username = urllib.unquote(username)
68
if password is not None:
69
password = urllib.unquote(password)
71
password = ui_factory.get_password(prompt='HTTP %(user)@%(host) password',
72
user=username, host=host)
73
password_manager.add_password(None, host, username, password)
74
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
78
def _extract_headers(header_text, url):
79
"""Extract the mapping for an rfc2822 header
81
This is a helper function for the test suite and for _pycurl.
82
(urllib already parses the headers for us)
84
In the case that there are multiple headers inside the file,
85
the last one is returned.
87
:param header_text: A string of header information.
88
This expects that the first line of a header will always be HTTP ...
89
:param url: The url we are parsing, so we can raise nice errors
90
:return: mimetools.Message object, which basically acts like a case
91
insensitive dictionary.
94
remaining = header_text
97
raise errors.InvalidHttpResponse(url, 'Empty headers')
100
header_file = StringIO(remaining)
101
first_line = header_file.readline()
102
if not first_line.startswith('HTTP'):
103
if first_header: # The first header *must* start with HTTP
104
raise errors.InvalidHttpResponse(url,
105
'Opening header line did not start with HTTP: %s'
107
assert False, 'Opening header line was not HTTP'
109
break # We are done parsing
111
m = mimetools.Message(header_file)
113
# mimetools.Message parses the first header up to a blank line
114
# So while there is remaining data, it probably means there is
115
# another header to be parsed.
116
# Get rid of any preceding whitespace, which if it is all whitespace
117
# will get rid of everything.
118
remaining = header_file.read().lstrip()
122
class HttpTransportBase(Transport):
123
"""Base class for http implementations.
125
Does URL parsing, etc, but not any network IO.
127
The protocol can be given as e.g. http+urllib://host/ to use a particular
131
# _proto: "http" or "https"
132
# _qualified_proto: may have "+pycurl", etc
44
TODO: Implement pipelined versions of all of the *_multi() functions.
134
47
def __init__(self, base):
135
48
"""Set the base path where files will be stored."""
136
proto_match = re.match(r'^(https?)(\+\w+)?://', base)
138
raise AssertionError("not a http url: %r" % base)
139
self._proto = proto_match.group(1)
140
impl_name = proto_match.group(2)
142
impl_name = impl_name[1:]
143
self._impl_name = impl_name
146
super(HttpTransportBase, self).__init__(base)
49
assert base.startswith('http://') or base.startswith('https://')
50
super(HttpTransport, self).__init__(base)
147
51
# In the future we might actually connect to the remote host
148
52
# rather than using get_url
149
53
# self._connection = None
150
(apparent_proto, self._host,
54
(self._proto, self._host,
151
55
self._path, self._parameters,
152
56
self._query, self._fragment) = urlparse.urlparse(self.base)
153
self._qualified_proto = apparent_proto
58
def should_cache(self):
59
"""Return True if the data pulled across should be cached locally.
63
def clone(self, offset=None):
64
"""Return a new HttpTransport with root at self.base + offset
65
For now HttpTransport does not actually connect, so just return
66
a new HttpTransport object.
69
return HttpTransport(self.base)
71
return HttpTransport(self.abspath(offset))
155
73
def abspath(self, relpath):
156
74
"""Return the full url to the given relative path.
158
This can be supplied with a string or a list.
160
The URL returned always has the protocol scheme originally used to
161
construct the transport, even if that includes an explicit
162
implementation qualifier.
75
This can be supplied with a string or a list
164
77
assert isinstance(relpath, basestring)
165
if isinstance(relpath, unicode):
166
raise InvalidURL(relpath, 'paths must not be unicode.')
167
78
if isinstance(relpath, basestring):
168
79
relpath_parts = relpath.split('/')
170
81
# TODO: Don't call this with an array - no magic interfaces
171
82
relpath_parts = relpath[:]
172
if relpath.startswith('/'):
175
# Except for the root, no trailing slashes are allowed
176
if len(relpath_parts) > 1 and relpath_parts[-1] == '':
83
if len(relpath_parts) > 1:
84
if relpath_parts[0] == '':
85
raise ValueError("path %r within branch %r seems to be absolute"
86
% (relpath, self._path))
87
if relpath_parts[-1] == '':
177
88
raise ValueError("path %r within branch %r seems to be a directory"
178
89
% (relpath, self._path))
179
basepath = self._path.split('/')
180
if len(basepath) > 0 and basepath[-1] == '':
181
basepath = basepath[:-1]
90
basepath = self._path.split('/')
91
if len(basepath) > 0 and basepath[-1] == '':
92
basepath = basepath[:-1]
183
93
for p in relpath_parts:
185
95
if len(basepath) == 0:
195
105
# I'm concerned about when it chooses to strip the last
196
106
# portion of the path, and when it doesn't.
197
107
path = '/'.join(basepath)
200
result = urlparse.urlunparse((self._qualified_proto,
201
self._host, path, '', '', ''))
204
def _real_abspath(self, relpath):
205
"""Produce absolute path, adjusting protocol if needed"""
206
abspath = self.abspath(relpath)
207
qp = self._qualified_proto
209
if self._qualified_proto != self._proto:
210
abspath = rp + abspath[len(qp):]
211
if not isinstance(abspath, str):
212
# escaping must be done at a higher level
213
abspath = abspath.encode('ascii')
108
return urlparse.urlunparse((self._proto,
109
self._host, path, '', '', ''))
216
111
def has(self, relpath):
217
raise NotImplementedError("has() is abstract on %r" % self)
219
def get(self, relpath):
112
"""Does the target location exist?
114
TODO: HttpTransport.has() should use a HEAD request,
115
not a full GET request.
117
TODO: This should be changed so that we don't use
118
urllib2 and get an exception, the code path would be
119
cleaner if we just do an http HEAD request, and parse
123
f = get_url(self.abspath(relpath))
124
# Without the read and then close()
125
# we tend to have busy sockets.
129
except urllib2.URLError, e:
134
if e.errno == errno.ENOENT:
136
raise HttpTransportError(orig_error=e)
138
def get(self, relpath, decode=False):
220
139
"""Get the file at the given relative path.
222
141
:param relpath: The relative path to the file
224
code, response_file = self._get(relpath, None)
227
def _get(self, relpath, ranges):
228
"""Get a file, or part of a file.
230
:param relpath: Path relative to transport base URL
231
:param ranges: None to get the whole file;
232
or [(start,end)] to fetch parts of a file.
234
:returns: (http_code, result_file)
236
Note that the current http implementations can only fetch one range at
237
a time through this call.
239
raise NotImplementedError(self._get)
241
def readv(self, relpath, offsets):
242
"""Get parts of the file at the given relative path.
244
:param offsets: A list of (offset, size) tuples.
245
:return: A list or generator of (offset, data) tuples
247
ranges = self.offsets_to_ranges(offsets)
248
mutter('http readv of %s collapsed %s offsets => %s',
249
relpath, len(offsets), ranges)
250
code, f = self._get(relpath, ranges)
251
for start, size in offsets:
252
f.seek(start, (start < 0) and 2 or 0)
255
if len(data) != size:
256
raise errors.ShortReadvError(relpath, start, size,
261
def offsets_to_ranges(offsets):
262
"""Turn a list of offsets and sizes into a list of byte ranges.
264
:param offsets: A list of tuples of (start, size). An empty list
266
:return: a list of inclusive byte ranges (start, end)
267
Adjacent ranges will be combined.
269
# Make sure we process sorted offsets
270
offsets = sorted(offsets)
275
for start, size in offsets:
276
end = start + size - 1
278
combined.append([start, end])
279
elif start <= prev_end + 1:
280
combined[-1][1] = end
282
combined.append([start, end])
287
def put_file(self, relpath, f, mode=None):
288
"""Copy the file-like object into the location.
144
return get_url(self.abspath(relpath))
145
except urllib2.HTTPError, e:
147
raise NoSuchFile(msg = "Error retrieving %s: %s"
148
% (self.abspath(relpath), str(e)),
151
except (BzrError, IOError), e:
152
raise ConnectionError(msg = "Error retrieving %s: %s"
153
% (self.abspath(relpath), str(e)),
156
def put(self, relpath, f):
157
"""Copy the file-like or string object into the location.
290
159
:param relpath: Location to put the contents, relative to base.
291
:param f: File-like object.
160
:param f: File-like or string object.
293
162
raise TransportNotPossible('http PUT not supported')
295
def mkdir(self, relpath, mode=None):
164
def mkdir(self, relpath):
296
165
"""Create a directory at the given path."""
297
166
raise TransportNotPossible('http does not support mkdir()')
299
def rmdir(self, relpath):
300
"""See Transport.rmdir."""
301
raise TransportNotPossible('http does not support rmdir()')
303
def append_file(self, relpath, f, mode=None):
168
def append(self, relpath, f):
304
169
"""Append the text in the file-like object into the final
368
228
:return: A lock object, which should be passed to Transport.unlock()
370
230
raise TransportNotPossible('http does not support lock_write()')
372
def clone(self, offset=None):
373
"""Return a new HttpTransportBase with root at self.base + offset
374
For now HttpTransportBase does not actually connect, so just return
375
a new HttpTransportBase object.
378
return self.__class__(self.base)
380
return self.__class__(self.abspath(offset))
383
def range_header(ranges, tail_amount):
384
"""Turn a list of bytes ranges into a HTTP Range header value.
386
:param ranges: A list of byte ranges, (start, end). An empty list
389
:return: HTTP range header string.
392
for start, end in ranges:
393
strings.append('%d-%d' % (start, end))
396
strings.append('-%d' % tail_amount)
398
return ','.join(strings)
401
#---------------- test server facilities ----------------
402
# TODO: load these only when running tests
405
class WebserverNotAvailable(Exception):
409
class BadWebserverPath(ValueError):
411
return 'path %s is not in %s' % self.args
414
class TestingHTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
416
def log_message(self, format, *args):
417
self.server.test_case.log('webserver - %s - - [%s] %s "%s" "%s"',
418
self.address_string(),
419
self.log_date_time_string(),
421
self.headers.get('referer', '-'),
422
self.headers.get('user-agent', '-'))
424
def handle_one_request(self):
425
"""Handle a single HTTP request.
427
You normally don't need to override this method; see the class
428
__doc__ string for information on how to handle specific HTTP
429
commands such as GET and POST.
432
for i in xrange(1,11): # Don't try more than 10 times
434
self.raw_requestline = self.rfile.readline()
435
except socket.error, e:
436
if e.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK):
437
# omitted for now because some tests look at the log of
438
# the server and expect to see no errors. see recent
439
# email thread. -- mbp 20051021.
440
## self.log_message('EAGAIN (%d) while reading from raw_requestline' % i)
446
if not self.raw_requestline:
447
self.close_connection = 1
449
if not self.parse_request(): # An error code has been sent, just exit
451
mname = 'do_' + self.command
452
if getattr(self, mname, None) is None:
453
self.send_error(501, "Unsupported method (%r)" % self.command)
455
method = getattr(self, mname)
458
if sys.platform == 'win32':
459
# On win32 you cannot access non-ascii filenames without
460
# decoding them into unicode first.
461
# However, under Linux, you can access bytestream paths
462
# without any problems. If this function was always active
463
# it would probably break tests when LANG=C was set
464
def translate_path(self, path):
465
"""Translate a /-separated PATH to the local filename syntax.
467
For bzr, all url paths are considered to be utf8 paths.
468
On Linux, you can access these paths directly over the bytestream
469
request, but on win32, you must decode them, and access them
472
# abandon query parameters
473
path = urlparse.urlparse(path)[2]
474
path = posixpath.normpath(urllib.unquote(path))
475
path = path.decode('utf-8')
476
words = path.split('/')
477
words = filter(None, words)
480
drive, word = os.path.splitdrive(word)
481
head, word = os.path.split(word)
482
if word in (os.curdir, os.pardir): continue
483
path = os.path.join(path, word)
487
class TestingHTTPServer(BaseHTTPServer.HTTPServer):
488
def __init__(self, server_address, RequestHandlerClass, test_case):
489
BaseHTTPServer.HTTPServer.__init__(self, server_address,
491
self.test_case = test_case
494
class HttpServer(Server):
495
"""A test server for http transports."""
497
# used to form the url that connects to this server
498
_url_protocol = 'http'
500
# Subclasses can provide a specific request handler
501
def __init__(self, request_handler=TestingHTTPRequestHandler):
502
Server.__init__(self)
503
self.request_handler = request_handler
505
def _http_start(self):
507
httpd = TestingHTTPServer(('localhost', 0),
508
self.request_handler,
510
host, port = httpd.socket.getsockname()
511
self._http_base_url = '%s://localhost:%s/' % (self._url_protocol, port)
512
self._http_starting.release()
513
httpd.socket.settimeout(0.1)
515
while self._http_running:
517
httpd.handle_request()
518
except socket.timeout:
521
def _get_remote_url(self, path):
522
path_parts = path.split(os.path.sep)
523
if os.path.isabs(path):
524
if path_parts[:len(self._local_path_parts)] != \
525
self._local_path_parts:
526
raise BadWebserverPath(path, self.test_dir)
527
remote_path = '/'.join(path_parts[len(self._local_path_parts):])
529
remote_path = '/'.join(path_parts)
531
self._http_starting.acquire()
532
self._http_starting.release()
533
return self._http_base_url + remote_path
535
def log(self, format, *args):
536
"""Capture Server log output."""
537
self.logs.append(format % args)
540
"""See bzrlib.transport.Server.setUp."""
541
self._home_dir = os.getcwdu()
542
self._local_path_parts = self._home_dir.split(os.path.sep)
543
self._http_starting = threading.Lock()
544
self._http_starting.acquire()
545
self._http_running = True
546
self._http_base_url = None
547
self._http_thread = threading.Thread(target=self._http_start)
548
self._http_thread.setDaemon(True)
549
self._http_thread.start()
550
self._http_proxy = os.environ.get("http_proxy")
551
if self._http_proxy is not None:
552
del os.environ["http_proxy"]
556
"""See bzrlib.transport.Server.tearDown."""
557
self._http_running = False
558
self._http_thread.join()
559
if self._http_proxy is not None:
561
os.environ["http_proxy"] = self._http_proxy
564
"""See bzrlib.transport.Server.get_url."""
565
return self._get_remote_url(self._home_dir)
567
def get_bogus_url(self):
568
"""See bzrlib.transport.Server.get_bogus_url."""
569
# this is chosen to try to prevent trouble with proxies, weird dns,
571
return 'http://127.0.0.1:1/'