1
# Copyright (C) 2005, 2006 Canonical Ltd
1
# Copyright (C) 2005 Canonical Ltd
3
3
# This program is free software; you can redistribute it and/or modify
4
4
# it under the terms of the GNU General Public License as published by
5
5
# the Free Software Foundation; either version 2 of the License, or
6
6
# (at your option) any later version.
8
8
# This program is distributed in the hope that it will be useful,
9
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
11
# GNU General Public License for more details.
13
13
# You should have received a copy of the GNU General Public License
14
14
# along with this program; if not, write to the Free Software
15
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Base implementation of Transport over http.
19
There are separate implementation modules for each http client implementation.
16
"""Implementation of Transport over http.
22
from collections import deque
19
from bzrlib.transport import Transport, register_transport
20
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
21
NonRelativePath, TransportError)
23
23
from cStringIO import StringIO
31
from warnings import warn
33
from bzrlib.transport import Transport, register_transport, Server
34
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
35
TransportError, ConnectionError, InvalidURL)
27
from bzrlib.errors import BzrError, BzrCheckError
36
28
from bzrlib.branch import Branch
37
29
from bzrlib.trace import mutter
38
# TODO: load these only when running http tests
39
import BaseHTTPServer, SimpleHTTPServer, socket, time
41
from bzrlib.ui import ui_factory
44
def extract_auth(url, password_manager):
45
"""Extract auth parameters from am HTTP/HTTPS url and add them to the given
46
password manager. Return the url, minus those auth parameters (which
49
assert re.match(r'^(https?)(\+\w+)?://', url), \
50
'invalid absolute url %r' % url
51
scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
31
# velocitynet.com.au transparently proxies connections and thereby
32
# breaks keep-alive -- sucks!
37
mutter("get_url %s" % url)
38
url_f = urllib2.urlopen(url)
41
class HttpTransportError(TransportError):
44
class HttpTransport(Transport):
45
"""This is the transport agent for http:// access.
54
auth, netloc = netloc.split('@', 1)
56
username, password = auth.split(':', 1)
58
username, password = auth, None
60
host = netloc.split(':', 1)[0]
63
username = urllib.unquote(username)
64
if password is not None:
65
password = urllib.unquote(password)
67
password = ui_factory.get_password(prompt='HTTP %(user)@%(host) password',
68
user=username, host=host)
69
password_manager.add_password(None, host, username, password)
70
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
74
class HttpTransportBase(Transport):
75
"""Base class for http implementations.
77
Does URL parsing, etc, but not any network IO.
79
The protocol can be given as e.g. http+urllib://host/ to use a particular
47
TODO: Implement pipelined versions of all of the *_multi() functions.
83
# _proto: "http" or "https"
84
# _qualified_proto: may have "+pycurl", etc
86
50
def __init__(self, base):
87
51
"""Set the base path where files will be stored."""
88
proto_match = re.match(r'^(https?)(\+\w+)?://', base)
90
raise AssertionError("not a http url: %r" % base)
91
self._proto = proto_match.group(1)
92
impl_name = proto_match.group(2)
94
impl_name = impl_name[1:]
95
self._impl_name = impl_name
98
super(HttpTransportBase, self).__init__(base)
52
assert base.startswith('http://') or base.startswith('https://')
53
super(HttpTransport, self).__init__(base)
99
54
# In the future we might actually connect to the remote host
100
55
# rather than using get_url
101
56
# self._connection = None
102
(apparent_proto, self._host,
57
(self._proto, self._host,
103
58
self._path, self._parameters,
104
59
self._query, self._fragment) = urlparse.urlparse(self.base)
105
self._qualified_proto = apparent_proto
61
def should_cache(self):
62
"""Return True if the data pulled across should be cached locally.
66
def clone(self, offset=None):
67
"""Return a new HttpTransport with root at self.base + offset
68
For now HttpTransport does not actually connect, so just return
69
a new HttpTransport object.
72
return HttpTransport(self.base)
74
return HttpTransport(self.abspath(offset))
107
76
def abspath(self, relpath):
108
77
"""Return the full url to the given relative path.
110
This can be supplied with a string or a list.
112
The URL returned always has the protocol scheme originally used to
113
construct the transport, even if that includes an explicit
114
implementation qualifier.
78
This can be supplied with a string or a list
116
assert isinstance(relpath, basestring)
117
if isinstance(relpath, unicode):
118
raise InvalidURL(relpath, 'paths must not be unicode.')
119
80
if isinstance(relpath, basestring):
120
relpath_parts = relpath.split('/')
122
# TODO: Don't call this with an array - no magic interfaces
123
relpath_parts = relpath[:]
124
if len(relpath_parts) > 1:
125
if relpath_parts[0] == '':
126
raise ValueError("path %r within branch %r seems to be absolute"
127
% (relpath, self._path))
128
if relpath_parts[-1] == '':
129
raise ValueError("path %r within branch %r seems to be a directory"
130
% (relpath, self._path))
131
82
basepath = self._path.split('/')
132
83
if len(basepath) > 0 and basepath[-1] == '':
133
84
basepath = basepath[:-1]
134
for p in relpath_parts:
136
if len(basepath) == 0:
137
89
# In most filesystems, a request for the parent
138
90
# of root, just returns root.
141
elif p == '.' or p == '':
144
96
basepath.append(p)
145
98
# Possibly, we could use urlparse.urljoin() here, but
146
99
# I'm concerned about when it chooses to strip the last
147
100
# portion of the path, and when it doesn't.
148
101
path = '/'.join(basepath)
151
result = urlparse.urlunparse((self._qualified_proto,
152
self._host, path, '', '', ''))
102
return urlparse.urlunparse((self._proto,
103
self._host, path, '', '', ''))
155
def _real_abspath(self, relpath):
156
"""Produce absolute path, adjusting protocol if needed"""
157
abspath = self.abspath(relpath)
158
qp = self._qualified_proto
160
if self._qualified_proto != self._proto:
161
abspath = rp + abspath[len(qp):]
162
if not isinstance(abspath, str):
163
# escaping must be done at a higher level
164
abspath = abspath.encode('ascii')
105
def relpath(self, abspath):
106
if not abspath.startswith(self.base):
107
raise NonRelativePath('path %r is not under base URL %r'
108
% (abspath, self.base))
110
return abspath[pl:].lstrip('/')
167
112
def has(self, relpath):
168
raise NotImplementedError("has() is abstract on %r" % self)
170
def get(self, relpath):
113
"""Does the target location exist?
115
TODO: HttpTransport.has() should use a HEAD request,
116
not a full GET request.
118
TODO: This should be changed so that we don't use
119
urllib2 and get an exception, the code path would be
120
cleaner if we just do an http HEAD request, and parse
124
f = get_url(self.abspath(relpath))
125
# Without the read and then close()
126
# we tend to have busy sockets.
132
except urllib2.URLError:
135
if e.errno == errno.ENOENT:
137
raise HttpTransportError(orig_error=e)
139
def get(self, relpath, decode=False):
171
140
"""Get the file at the given relative path.
173
142
:param relpath: The relative path to the file
175
code, response_file = self._get(relpath, None)
178
def _get(self, relpath, ranges):
179
"""Get a file, or part of a file.
181
:param relpath: Path relative to transport base URL
182
:param byte_range: None to get the whole file;
183
or [(start,end)] to fetch parts of a file.
185
:returns: (http_code, result_file)
187
Note that the current http implementations can only fetch one range at
188
a time through this call.
190
raise NotImplementedError(self._get)
192
def readv(self, relpath, offsets):
193
"""Get parts of the file at the given relative path.
195
:param offsets: A list of (offset, size) tuples.
196
:param return: A list or generator of (offset, data) tuples
198
# Ideally we would pass one big request asking for all the ranges in
199
# one go; however then the server will give a multipart mime response
200
# back, and we can't parse them yet. So instead we just get one range
201
# per region, and try to coalesce the regions as much as possible.
203
# The read-coalescing code is not quite regular enough to have a
204
# single driver routine and
205
# helper method in Transport.
206
def do_combined_read(combined_offsets):
207
# read one coalesced block
209
for offset, size in combined_offsets:
211
mutter('readv coalesced %d reads.', len(combined_offsets))
212
offset = combined_offsets[0][0]
213
byte_range = (offset, offset + total_size - 1)
214
code, result_file = self._get(relpath, [byte_range])
216
for off, size in combined_offsets:
217
result_bytes = result_file.read(size)
218
assert len(result_bytes) == size
219
yield off, result_bytes
221
data = result_file.read(offset + total_size)[offset:offset + total_size]
223
for offset, size in combined_offsets:
224
yield offset, data[pos:pos + size]
229
pending_offsets = deque(offsets)
230
combined_offsets = []
231
while len(pending_offsets):
232
offset, size = pending_offsets.popleft()
233
if not combined_offsets:
234
combined_offsets = [[offset, size]]
236
if (len (combined_offsets) < 500 and
237
combined_offsets[-1][0] + combined_offsets[-1][1] == offset):
239
combined_offsets.append([offset, size])
241
# incompatible, or over the threshold, so issue a read and yield
242
pending_offsets.appendleft((offset, size))
243
for result in do_combined_read(combined_offsets):
245
combined_offsets = []
246
# whatever is left is a single coalesced request
247
if len(combined_offsets):
248
for result in do_combined_read(combined_offsets):
251
def put(self, relpath, f, mode=None):
145
return get_url(self.abspath(relpath))
146
except (BzrError, urllib2.URLError, IOError), e:
147
raise NoSuchFile(orig_error=e)
149
raise HttpTransportError(orig_error=e)
151
def get_partial(self, relpath, start, length=None):
152
"""Get just part of a file.
154
:param relpath: Path to the file, relative to base
155
:param start: The starting position to read from
156
:param length: The length to read. A length of None indicates
157
read to the end of the file.
158
:return: A file-like object containing at least the specified bytes.
159
Some implementations may return objects which can be read
160
past this length, but this is not guaranteed.
162
# TODO: You can make specialized http requests for just
163
# a portion of the file. Figure out how to do that.
164
# For now, urllib2 returns files that cannot seek() so
165
# we just read bytes off the beginning, until we
166
# get to the point that we care about.
167
f = self.get(relpath)
168
# TODO: read in smaller chunks, in case things are
169
# buffered internally.
173
def put(self, relpath, f):
252
174
"""Copy the file-like or string object into the location.
254
176
:param relpath: Location to put the contents, relative to base.
334
247
raise TransportNotPossible('http does not support lock_write()')
336
def clone(self, offset=None):
337
"""Return a new HttpTransportBase with root at self.base + offset
338
For now HttpTransportBase does not actually connect, so just return
339
a new HttpTransportBase object.
342
return self.__class__(self.base)
344
return self.__class__(self.abspath(offset))
346
#---------------- test server facilities ----------------
347
# TODO: load these only when running tests
350
class WebserverNotAvailable(Exception):
354
class BadWebserverPath(ValueError):
356
return 'path %s is not in %s' % self.args
359
class TestingHTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
361
def log_message(self, format, *args):
362
self.server.test_case.log('webserver - %s - - [%s] %s "%s" "%s"',
363
self.address_string(),
364
self.log_date_time_string(),
366
self.headers.get('referer', '-'),
367
self.headers.get('user-agent', '-'))
369
def handle_one_request(self):
370
"""Handle a single HTTP request.
372
You normally don't need to override this method; see the class
373
__doc__ string for information on how to handle specific HTTP
374
commands such as GET and POST.
377
for i in xrange(1,11): # Don't try more than 10 times
379
self.raw_requestline = self.rfile.readline()
380
except socket.error, e:
381
if e.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK):
382
# omitted for now because some tests look at the log of
383
# the server and expect to see no errors. see recent
384
# email thread. -- mbp 20051021.
385
## self.log_message('EAGAIN (%d) while reading from raw_requestline' % i)
391
if not self.raw_requestline:
392
self.close_connection = 1
394
if not self.parse_request(): # An error code has been sent, just exit
396
mname = 'do_' + self.command
397
if not hasattr(self, mname):
398
self.send_error(501, "Unsupported method (%r)" % self.command)
400
method = getattr(self, mname)
403
if sys.platform == 'win32':
404
# On win32 you cannot access non-ascii filenames without
405
# decoding them into unicode first.
406
# However, under Linux, you can access bytestream paths
407
# without any problems. If this function was always active
408
# it would probably break tests when LANG=C was set
409
def translate_path(self, path):
410
"""Translate a /-separated PATH to the local filename syntax.
412
For bzr, all url paths are considered to be utf8 paths.
413
On Linux, you can access these paths directly over the bytestream
414
request, but on win32, you must decode them, and access them
417
# abandon query parameters
418
path = urlparse.urlparse(path)[2]
419
path = posixpath.normpath(urllib.unquote(path))
420
path = path.decode('utf-8')
421
words = path.split('/')
422
words = filter(None, words)
425
drive, word = os.path.splitdrive(word)
426
head, word = os.path.split(word)
427
if word in (os.curdir, os.pardir): continue
428
path = os.path.join(path, word)
432
class TestingHTTPServer(BaseHTTPServer.HTTPServer):
433
def __init__(self, server_address, RequestHandlerClass, test_case):
434
BaseHTTPServer.HTTPServer.__init__(self, server_address,
436
self.test_case = test_case
438
class HttpServer(Server):
439
"""A test server for http transports."""
441
# used to form the url that connects to this server
442
_url_protocol = 'http'
444
def _http_start(self):
446
httpd = TestingHTTPServer(('localhost', 0),
447
TestingHTTPRequestHandler,
449
host, port = httpd.socket.getsockname()
450
self._http_base_url = '%s://localhost:%s/' % (self._url_protocol, port)
451
self._http_starting.release()
452
httpd.socket.settimeout(0.1)
454
while self._http_running:
456
httpd.handle_request()
457
except socket.timeout:
460
def _get_remote_url(self, path):
461
path_parts = path.split(os.path.sep)
462
if os.path.isabs(path):
463
if path_parts[:len(self._local_path_parts)] != \
464
self._local_path_parts:
465
raise BadWebserverPath(path, self.test_dir)
466
remote_path = '/'.join(path_parts[len(self._local_path_parts):])
468
remote_path = '/'.join(path_parts)
470
self._http_starting.acquire()
471
self._http_starting.release()
472
return self._http_base_url + remote_path
474
def log(self, format, *args):
475
"""Capture Server log output."""
476
self.logs.append(format % args)
479
"""See bzrlib.transport.Server.setUp."""
480
self._home_dir = os.getcwdu()
481
self._local_path_parts = self._home_dir.split(os.path.sep)
482
self._http_starting = threading.Lock()
483
self._http_starting.acquire()
484
self._http_running = True
485
self._http_base_url = None
486
self._http_thread = threading.Thread(target=self._http_start)
487
self._http_thread.setDaemon(True)
488
self._http_thread.start()
489
self._http_proxy = os.environ.get("http_proxy")
490
if self._http_proxy is not None:
491
del os.environ["http_proxy"]
495
"""See bzrlib.transport.Server.tearDown."""
496
self._http_running = False
497
self._http_thread.join()
498
if self._http_proxy is not None:
500
os.environ["http_proxy"] = self._http_proxy
503
"""See bzrlib.transport.Server.get_url."""
504
return self._get_remote_url(self._home_dir)
506
def get_bogus_url(self):
507
"""See bzrlib.transport.Server.get_bogus_url."""
508
# this is chosen to try to prevent trouble with proxies, weird dns,
510
return 'http://127.0.0.1:1/'
249
register_transport('http://', HttpTransport)
250
register_transport('https://', HttpTransport)