1
# Copyright (C) 2005, 2006 Canonical Ltd
1
# Copyright (C) 2005 Canonical Ltd
3
3
# This program is free software; you can redistribute it and/or modify
4
4
# it under the terms of the GNU General Public License as published by
5
5
# the Free Software Foundation; either version 2 of the License, or
6
6
# (at your option) any later version.
8
8
# This program is distributed in the hope that it will be useful,
9
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
11
# GNU General Public License for more details.
13
13
# You should have received a copy of the GNU General Public License
14
14
# along with this program; if not, write to the Free Software
15
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Base implementation of Transport over http.
19
There are separate implementation modules for each http client implementation.
16
"""Implementation of Transport over http.
19
from bzrlib.transport import Transport, register_transport
20
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
21
NonRelativePath, TransportError)
22
23
from cStringIO import StringIO
31
from warnings import warn
33
# TODO: load these only when running http tests
34
import BaseHTTPServer, SimpleHTTPServer, socket, time
37
from bzrlib import errors
38
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
39
TransportError, ConnectionError, InvalidURL)
27
from bzrlib.errors import BzrError, BzrCheckError
40
28
from bzrlib.branch import Branch
41
29
from bzrlib.trace import mutter
42
from bzrlib.transport import Transport, register_transport, Server
43
from bzrlib.transport.http.response import (HttpMultipartRangeResponse,
45
from bzrlib.ui import ui_factory
48
def extract_auth(url, password_manager):
49
"""Extract auth parameters from am HTTP/HTTPS url and add them to the given
50
password manager. Return the url, minus those auth parameters (which
53
assert re.match(r'^(https?)(\+\w+)?://', url), \
54
'invalid absolute url %r' % url
55
scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
31
# velocitynet.com.au transparently proxies connections and thereby
32
# breaks keep-alive -- sucks!
37
mutter("get_url %s" % url)
38
url_f = urllib2.urlopen(url)
41
class HttpTransportError(TransportError):
44
class HttpTransport(Transport):
45
"""This is the transport agent for http:// access.
58
auth, netloc = netloc.split('@', 1)
60
username, password = auth.split(':', 1)
62
username, password = auth, None
64
host = netloc.split(':', 1)[0]
67
username = urllib.unquote(username)
68
if password is not None:
69
password = urllib.unquote(password)
71
password = ui_factory.get_password(prompt='HTTP %(user)@%(host) password',
72
user=username, host=host)
73
password_manager.add_password(None, host, username, password)
74
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
78
def _extract_headers(header_text, url):
79
"""Extract the mapping for an rfc2822 header
81
This is a helper function for the test suite and for _pycurl.
82
(urllib already parses the headers for us)
84
In the case that there are multiple headers inside the file,
85
the last one is returned.
87
:param header_text: A string of header information.
88
This expects that the first line of a header will always be HTTP ...
89
:param url: The url we are parsing, so we can raise nice errors
90
:return: mimetools.Message object, which basically acts like a case
91
insensitive dictionary.
94
remaining = header_text
97
raise errors.InvalidHttpResponse(url, 'Empty headers')
100
header_file = StringIO(remaining)
101
first_line = header_file.readline()
102
if not first_line.startswith('HTTP'):
103
if first_header: # The first header *must* start with HTTP
104
raise errors.InvalidHttpResponse(url,
105
'Opening header line did not start with HTTP: %s'
107
assert False, 'Opening header line was not HTTP'
109
break # We are done parsing
111
m = mimetools.Message(header_file)
113
# mimetools.Message parses the first header up to a blank line
114
# So while there is remaining data, it probably means there is
115
# another header to be parsed.
116
# Get rid of any preceding whitespace, which if it is all whitespace
117
# will get rid of everything.
118
remaining = header_file.read().lstrip()
122
class HttpTransportBase(Transport):
123
"""Base class for http implementations.
125
Does URL parsing, etc, but not any network IO.
127
The protocol can be given as e.g. http+urllib://host/ to use a particular
131
# _proto: "http" or "https"
132
# _qualified_proto: may have "+pycurl", etc
47
TODO: Implement pipelined versions of all of the *_multi() functions.
134
50
def __init__(self, base):
135
51
"""Set the base path where files will be stored."""
136
proto_match = re.match(r'^(https?)(\+\w+)?://', base)
138
raise AssertionError("not a http url: %r" % base)
139
self._proto = proto_match.group(1)
140
impl_name = proto_match.group(2)
142
impl_name = impl_name[1:]
143
self._impl_name = impl_name
146
super(HttpTransportBase, self).__init__(base)
52
assert base.startswith('http://') or base.startswith('https://')
53
super(HttpTransport, self).__init__(base)
147
54
# In the future we might actually connect to the remote host
148
55
# rather than using get_url
149
56
# self._connection = None
150
(apparent_proto, self._host,
57
(self._proto, self._host,
151
58
self._path, self._parameters,
152
59
self._query, self._fragment) = urlparse.urlparse(self.base)
153
self._qualified_proto = apparent_proto
61
def should_cache(self):
62
"""Return True if the data pulled across should be cached locally.
66
def clone(self, offset=None):
67
"""Return a new HttpTransport with root at self.base + offset
68
For now HttpTransport does not actually connect, so just return
69
a new HttpTransport object.
72
return HttpTransport(self.base)
74
return HttpTransport(self.abspath(offset))
155
76
def abspath(self, relpath):
156
77
"""Return the full url to the given relative path.
158
This can be supplied with a string or a list.
160
The URL returned always has the protocol scheme originally used to
161
construct the transport, even if that includes an explicit
162
implementation qualifier.
78
This can be supplied with a string or a list
164
assert isinstance(relpath, basestring)
165
if isinstance(relpath, unicode):
166
raise InvalidURL(relpath, 'paths must not be unicode.')
167
80
if isinstance(relpath, basestring):
168
relpath_parts = relpath.split('/')
170
# TODO: Don't call this with an array - no magic interfaces
171
relpath_parts = relpath[:]
172
if len(relpath_parts) > 1:
173
if relpath_parts[0] == '':
174
raise ValueError("path %r within branch %r seems to be absolute"
175
% (relpath, self._path))
176
if relpath_parts[-1] == '':
177
raise ValueError("path %r within branch %r seems to be a directory"
178
% (relpath, self._path))
179
82
basepath = self._path.split('/')
180
83
if len(basepath) > 0 and basepath[-1] == '':
181
84
basepath = basepath[:-1]
182
for p in relpath_parts:
184
if len(basepath) == 0:
185
89
# In most filesystems, a request for the parent
186
90
# of root, just returns root.
189
elif p == '.' or p == '':
192
97
basepath.append(p)
193
99
# Possibly, we could use urlparse.urljoin() here, but
194
100
# I'm concerned about when it chooses to strip the last
195
101
# portion of the path, and when it doesn't.
196
102
path = '/'.join(basepath)
199
result = urlparse.urlunparse((self._qualified_proto,
200
self._host, path, '', '', ''))
103
return urlparse.urlunparse((self._proto,
104
self._host, path, '', '', ''))
203
def _real_abspath(self, relpath):
204
"""Produce absolute path, adjusting protocol if needed"""
205
abspath = self.abspath(relpath)
206
qp = self._qualified_proto
208
if self._qualified_proto != self._proto:
209
abspath = rp + abspath[len(qp):]
210
if not isinstance(abspath, str):
211
# escaping must be done at a higher level
212
abspath = abspath.encode('ascii')
106
def relpath(self, abspath):
107
if not abspath.startswith(self.base):
108
raise NonRelativePath('path %r is not under base URL %r'
109
% (abspath, self.base))
111
return abspath[pl:].lstrip('/')
215
113
def has(self, relpath):
216
raise NotImplementedError("has() is abstract on %r" % self)
218
def get(self, relpath):
114
"""Does the target location exist?
116
TODO: HttpTransport.has() should use a HEAD request,
117
not a full GET request.
119
TODO: This should be changed so that we don't use
120
urllib2 and get an exception, the code path would be
121
cleaner if we just do an http HEAD request, and parse
125
f = get_url(self.abspath(relpath))
126
# Without the read and then close()
127
# we tend to have busy sockets.
133
except urllib2.URLError:
136
if e.errno == errno.ENOENT:
138
raise HttpTransportError(orig_error=e)
140
def get(self, relpath, decode=False):
219
141
"""Get the file at the given relative path.
221
143
:param relpath: The relative path to the file
223
code, response_file = self._get(relpath, None)
226
def _get(self, relpath, ranges):
227
"""Get a file, or part of a file.
229
:param relpath: Path relative to transport base URL
230
:param ranges: None to get the whole file;
231
or [(start,end)] to fetch parts of a file.
233
:returns: (http_code, result_file)
235
Note that the current http implementations can only fetch one range at
236
a time through this call.
238
raise NotImplementedError(self._get)
240
def readv(self, relpath, offsets):
241
"""Get parts of the file at the given relative path.
243
:param offsets: A list of (offset, size) tuples.
244
:return: A list or generator of (offset, data) tuples
246
ranges = self.offsets_to_ranges(offsets)
247
mutter('http readv of %s collapsed %s offsets => %s',
248
relpath, len(offsets), ranges)
249
code, f = self._get(relpath, ranges)
250
for start, size in offsets:
251
f.seek(start, (start < 0) and 2 or 0)
254
assert len(data) == size
258
def offsets_to_ranges(offsets):
259
"""Turn a list of offsets and sizes into a list of byte ranges.
261
:param offsets: A list of tuples of (start, size). An empty list
263
:return: a list of inclusive byte ranges (start, end)
264
Adjacent ranges will be combined.
266
# Make sure we process sorted offsets
267
offsets = sorted(offsets)
272
for start, size in offsets:
273
end = start + size - 1
275
combined.append([start, end])
276
elif start <= prev_end + 1:
277
combined[-1][1] = end
279
combined.append([start, end])
284
def put(self, relpath, f, mode=None):
146
return get_url(self.abspath(relpath))
147
except (BzrError, urllib2.URLError, IOError), e:
148
raise NoSuchFile(msg = "Error retrieving %s"
149
% self.abspath(relpath),
152
def get_partial(self, relpath, start, length=None):
153
"""Get just part of a file.
155
:param relpath: Path to the file, relative to base
156
:param start: The starting position to read from
157
:param length: The length to read. A length of None indicates
158
read to the end of the file.
159
:return: A file-like object containing at least the specified bytes.
160
Some implementations may return objects which can be read
161
past this length, but this is not guaranteed.
163
# TODO: You can make specialized http requests for just
164
# a portion of the file. Figure out how to do that.
165
# For now, urllib2 returns files that cannot seek() so
166
# we just read bytes off the beginning, until we
167
# get to the point that we care about.
168
f = self.get(relpath)
169
# TODO: read in smaller chunks, in case things are
170
# buffered internally.
174
def put(self, relpath, f):
285
175
"""Copy the file-like or string object into the location.
287
177
:param relpath: Location to put the contents, relative to base.
367
248
raise TransportNotPossible('http does not support lock_write()')
369
def clone(self, offset=None):
370
"""Return a new HttpTransportBase with root at self.base + offset
371
For now HttpTransportBase does not actually connect, so just return
372
a new HttpTransportBase object.
375
return self.__class__(self.base)
377
return self.__class__(self.abspath(offset))
380
def range_header(ranges, tail_amount):
381
"""Turn a list of bytes ranges into a HTTP Range header value.
383
:param ranges: A list of byte ranges, (start, end). An empty list
386
:return: HTTP range header string.
389
for start, end in ranges:
390
strings.append('%d-%d' % (start, end))
393
strings.append('-%d' % tail_amount)
395
return ','.join(strings)
398
#---------------- test server facilities ----------------
399
# TODO: load these only when running tests
402
class WebserverNotAvailable(Exception):
406
class BadWebserverPath(ValueError):
408
return 'path %s is not in %s' % self.args
411
class TestingHTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
413
def log_message(self, format, *args):
414
self.server.test_case.log('webserver - %s - - [%s] %s "%s" "%s"',
415
self.address_string(),
416
self.log_date_time_string(),
418
self.headers.get('referer', '-'),
419
self.headers.get('user-agent', '-'))
421
def handle_one_request(self):
422
"""Handle a single HTTP request.
424
You normally don't need to override this method; see the class
425
__doc__ string for information on how to handle specific HTTP
426
commands such as GET and POST.
429
for i in xrange(1,11): # Don't try more than 10 times
431
self.raw_requestline = self.rfile.readline()
432
except socket.error, e:
433
if e.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK):
434
# omitted for now because some tests look at the log of
435
# the server and expect to see no errors. see recent
436
# email thread. -- mbp 20051021.
437
## self.log_message('EAGAIN (%d) while reading from raw_requestline' % i)
443
if not self.raw_requestline:
444
self.close_connection = 1
446
if not self.parse_request(): # An error code has been sent, just exit
448
mname = 'do_' + self.command
449
if not hasattr(self, mname):
450
self.send_error(501, "Unsupported method (%r)" % self.command)
452
method = getattr(self, mname)
455
if sys.platform == 'win32':
456
# On win32 you cannot access non-ascii filenames without
457
# decoding them into unicode first.
458
# However, under Linux, you can access bytestream paths
459
# without any problems. If this function was always active
460
# it would probably break tests when LANG=C was set
461
def translate_path(self, path):
462
"""Translate a /-separated PATH to the local filename syntax.
464
For bzr, all url paths are considered to be utf8 paths.
465
On Linux, you can access these paths directly over the bytestream
466
request, but on win32, you must decode them, and access them
469
# abandon query parameters
470
path = urlparse.urlparse(path)[2]
471
path = posixpath.normpath(urllib.unquote(path))
472
path = path.decode('utf-8')
473
words = path.split('/')
474
words = filter(None, words)
477
drive, word = os.path.splitdrive(word)
478
head, word = os.path.split(word)
479
if word in (os.curdir, os.pardir): continue
480
path = os.path.join(path, word)
484
class TestingHTTPServer(BaseHTTPServer.HTTPServer):
485
def __init__(self, server_address, RequestHandlerClass, test_case):
486
BaseHTTPServer.HTTPServer.__init__(self, server_address,
488
self.test_case = test_case
491
class HttpServer(Server):
492
"""A test server for http transports."""
494
# used to form the url that connects to this server
495
_url_protocol = 'http'
497
def _http_start(self):
499
httpd = TestingHTTPServer(('localhost', 0),
500
TestingHTTPRequestHandler,
502
host, port = httpd.socket.getsockname()
503
self._http_base_url = '%s://localhost:%s/' % (self._url_protocol, port)
504
self._http_starting.release()
505
httpd.socket.settimeout(0.1)
507
while self._http_running:
509
httpd.handle_request()
510
except socket.timeout:
513
def _get_remote_url(self, path):
514
path_parts = path.split(os.path.sep)
515
if os.path.isabs(path):
516
if path_parts[:len(self._local_path_parts)] != \
517
self._local_path_parts:
518
raise BadWebserverPath(path, self.test_dir)
519
remote_path = '/'.join(path_parts[len(self._local_path_parts):])
521
remote_path = '/'.join(path_parts)
523
self._http_starting.acquire()
524
self._http_starting.release()
525
return self._http_base_url + remote_path
527
def log(self, format, *args):
528
"""Capture Server log output."""
529
self.logs.append(format % args)
532
"""See bzrlib.transport.Server.setUp."""
533
self._home_dir = os.getcwdu()
534
self._local_path_parts = self._home_dir.split(os.path.sep)
535
self._http_starting = threading.Lock()
536
self._http_starting.acquire()
537
self._http_running = True
538
self._http_base_url = None
539
self._http_thread = threading.Thread(target=self._http_start)
540
self._http_thread.setDaemon(True)
541
self._http_thread.start()
542
self._http_proxy = os.environ.get("http_proxy")
543
if self._http_proxy is not None:
544
del os.environ["http_proxy"]
548
"""See bzrlib.transport.Server.tearDown."""
549
self._http_running = False
550
self._http_thread.join()
551
if self._http_proxy is not None:
553
os.environ["http_proxy"] = self._http_proxy
556
"""See bzrlib.transport.Server.get_url."""
557
return self._get_remote_url(self._home_dir)
559
def get_bogus_url(self):
560
"""See bzrlib.transport.Server.get_bogus_url."""
561
# this is chosen to try to prevent trouble with proxies, weird DNS,
563
return 'http://127.0.0.1:1/'
250
register_transport('http://', HttpTransport)
251
register_transport('https://', HttpTransport)