1
# Copyright (C) 2005, 2006, 2007 Canonical Ltd
1
# Copyright (C) 2005 Canonical Ltd
3
3
# This program is free software; you can redistribute it and/or modify
4
4
# it under the terms of the GNU General Public License as published by
5
5
# the Free Software Foundation; either version 2 of the License, or
6
6
# (at your option) any later version.
8
8
# This program is distributed in the hope that it will be useful,
9
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
11
# GNU General Public License for more details.
13
13
# You should have received a copy of the GNU General Public License
14
14
# along with this program; if not, write to the Free Software
15
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Base implementation of Transport over http.
19
There are separate implementation modules for each http client implementation.
16
"""Implementation of Transport over http.
19
from bzrlib.transport import Transport, register_transport
20
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
21
NonRelativePath, TransportError)
22
23
from cStringIO import StringIO
34
from bzrlib.smart import medium
35
from bzrlib.symbol_versioning import (
27
from bzrlib.errors import BzrError, BzrCheckError
28
from bzrlib.branch import Branch
39
29
from bzrlib.trace import mutter
40
from bzrlib.transport import (
46
# TODO: This is not used anymore by HttpTransport_urllib
47
# (extracting the auth info and prompting the user for a password
48
# have been split), only the tests still use it. It should be
49
# deleted and the tests rewritten ASAP to stay in sync.
50
def extract_auth(url, password_manager):
51
"""Extract auth parameters from am HTTP/HTTPS url and add them to the given
52
password manager. Return the url, minus those auth parameters (which
55
assert re.match(r'^(https?)(\+\w+)?://', url), \
56
'invalid absolute url %r' % url
57
scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
60
auth, netloc = netloc.split('@', 1)
62
username, password = auth.split(':', 1)
64
username, password = auth, None
66
host = netloc.split(':', 1)[0]
69
username = urllib.unquote(username)
70
if password is not None:
71
password = urllib.unquote(password)
73
password = ui.ui_factory.get_password(
74
prompt='HTTP %(user)s@%(host)s password',
75
user=username, host=host)
76
password_manager.add_password(None, host, username, password)
77
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
81
class HttpTransportBase(ConnectedTransport, medium.SmartClientMedium):
82
"""Base class for http implementations.
84
Does URL parsing, etc, but not any network IO.
86
The protocol can be given as e.g. http+urllib://host/ to use a particular
90
# _unqualified_scheme: "http" or "https"
91
# _scheme: may have "+pycurl", etc
93
def __init__(self, base, _from_transport=None):
34
mutter("get_url %s" % url)
35
url_f = urllib2.urlopen(url)
38
class HttpTransportError(TransportError):
41
class HttpTransport(Transport):
42
"""This is the transport agent for http:// access.
44
TODO: Implement pipelined versions of all of the *_multi() functions.
47
def __init__(self, base):
94
48
"""Set the base path where files will be stored."""
95
proto_match = re.match(r'^(https?)(\+\w+)?://', base)
97
raise AssertionError("not a http url: %r" % base)
98
self._unqualified_scheme = proto_match.group(1)
99
impl_name = proto_match.group(2)
101
impl_name = impl_name[1:]
102
self._impl_name = impl_name
103
super(HttpTransportBase, self).__init__(base,
104
_from_transport=_from_transport)
105
# range hint is handled dynamically throughout the life
106
# of the transport object. We start by trying multi-range
107
# requests and if the server returns bogus results, we
108
# retry with single range requests and, finally, we
109
# forget about range if the server really can't
110
# understand. Once acquired, this piece of info is
111
# propagated to clones.
112
if _from_transport is not None:
113
self._range_hint = _from_transport._range_hint
115
self._range_hint = 'multi'
49
assert base.startswith('http://') or base.startswith('https://')
50
super(HttpTransport, self).__init__(base)
51
# In the future we might actually connect to the remote host
52
# rather than using get_url
53
# self._connection = None
54
(self._proto, self._host,
55
self._path, self._parameters,
56
self._query, self._fragment) = urlparse.urlparse(self.base)
58
def should_cache(self):
59
"""Return True if the data pulled across should be cached locally.
63
def clone(self, offset=None):
64
"""Return a new HttpTransport with root at self.base + offset
65
For now HttpTransport does not actually connect, so just return
66
a new HttpTransport object.
69
return HttpTransport(self.base)
71
return HttpTransport(self.abspath(offset))
73
def abspath(self, relpath):
74
"""Return the full url to the given relative path.
75
This can be supplied with a string or a list
77
assert isinstance(relpath, basestring)
78
if isinstance(relpath, basestring):
79
relpath_parts = relpath.split('/')
81
# TODO: Don't call this with an array - no magic interfaces
82
relpath_parts = relpath[:]
83
if len(relpath_parts) > 1:
84
if relpath_parts[0] == '':
85
raise ValueError("path %r within branch %r seems to be absolute"
86
% (relpath, self._path))
87
if relpath_parts[-1] == '':
88
raise ValueError("path %r within branch %r seems to be a directory"
89
% (relpath, self._path))
90
basepath = self._path.split('/')
91
if len(basepath) > 0 and basepath[-1] == '':
92
basepath = basepath[:-1]
93
for p in relpath_parts:
95
if len(basepath) == 0:
96
# In most filesystems, a request for the parent
97
# of root, just returns root.
100
elif p == '.' or p == '':
104
# Possibly, we could use urlparse.urljoin() here, but
105
# I'm concerned about when it chooses to strip the last
106
# portion of the path, and when it doesn't.
107
path = '/'.join(basepath)
108
return urlparse.urlunparse((self._proto,
109
self._host, path, '', '', ''))
117
111
def has(self, relpath):
118
raise NotImplementedError("has() is abstract on %r" % self)
120
def get(self, relpath):
112
"""Does the target location exist?
114
TODO: HttpTransport.has() should use a HEAD request,
115
not a full GET request.
117
TODO: This should be changed so that we don't use
118
urllib2 and get an exception, the code path would be
119
cleaner if we just do an http HEAD request, and parse
123
f = get_url(self.abspath(relpath))
124
# Without the read and then close()
125
# we tend to have busy sockets.
129
except urllib2.URLError, e:
134
if e.errno == errno.ENOENT:
136
raise HttpTransportError(orig_error=e)
138
def get(self, relpath, decode=False):
121
139
"""Get the file at the given relative path.
123
141
:param relpath: The relative path to the file
125
code, response_file = self._get(relpath, None)
126
# FIXME: some callers want an iterable... One step forward, three steps
127
# backwards :-/ And not only an iterable, but an iterable that can be
128
# seeked backwards, so we will never be able to do that. One such
129
# known client is bzrlib.bundle.serializer.v4.get_bundle_reader. At the
130
# time of this writing it's even the only known client -- vila20071203
131
return StringIO(response_file.read())
133
def _get(self, relpath, ranges, tail_amount=0):
134
"""Get a file, or part of a file.
136
:param relpath: Path relative to transport base URL
137
:param ranges: None to get the whole file;
138
or a list of _CoalescedOffset to fetch parts of a file.
139
:param tail_amount: The amount to get from the end of the file.
141
:returns: (http_code, result_file)
143
raise NotImplementedError(self._get)
145
def _remote_path(self, relpath):
146
"""See ConnectedTransport._remote_path.
148
user and passwords are not embedded in the path provided to the server.
150
relative = urlutils.unescape(relpath).encode('utf-8')
151
path = self._combine_paths(self._path, relative)
152
return self._unsplit_url(self._unqualified_scheme,
153
None, None, self._host, self._port, path)
155
def _create_auth(self):
156
"""Returns a dict returning the credentials provided at build time."""
157
auth = dict(host=self._host, port=self._port,
158
user=self._user, password=self._password,
159
protocol=self._unqualified_scheme,
163
def get_request(self):
164
return SmartClientHTTPMediumRequest(self)
166
def get_smart_medium(self):
167
"""See Transport.get_smart_medium.
169
HttpTransportBase directly implements the minimal interface of
170
SmartMediumClient, so this returns self.
174
def _degrade_range_hint(self, relpath, ranges, exc_info):
175
if self._range_hint == 'multi':
176
self._range_hint = 'single'
177
mutter('Retry "%s" with single range request' % relpath)
178
elif self._range_hint == 'single':
179
self._range_hint = None
180
mutter('Retry "%s" without ranges' % relpath)
182
# We tried all the tricks, but nothing worked. We re-raise the
183
# original exception; the 'mutter' calls above will indicate that
184
# further tries were unsuccessful
185
raise exc_info[0], exc_info[1], exc_info[2]
187
# _coalesce_offsets is a helper for readv, it tries to combine ranges without
188
# degrading readv performance. _bytes_to_read_before_seek is the value
189
# used for the limit parameter and has been tuned for other transports. For
190
# HTTP, the name is inappropriate but the parameter is still useful and
191
# helps reduce the number of chunks in the response. The overhead for a
192
# chunk (headers, length, footer around the data itself) is variable but
193
# around 50 bytes. We use 128 to reduce the range specifiers that appear in
194
# the header, some servers (notably Apache) enforce a maximum length for a
195
# header and issue a '400: Bad request' error when too many ranges are
197
_bytes_to_read_before_seek = 128
198
# No limit on the offset number that get combined into one, we are trying
199
# to avoid downloading the whole file.
200
_max_readv_combine = 0
201
# By default Apache has a limit of ~400 ranges before replying with a 400
202
# Bad Request. So we go underneath that amount to be safe.
203
_max_get_ranges = 200
204
# We impose no limit on the range size. But see _pycurl.py for a different
208
def _readv(self, relpath, offsets):
209
"""Get parts of the file at the given relative path.
211
:param offsets: A list of (offset, size) tuples.
212
:return: A list or generator of (offset, data) tuples
215
# offsets may be a generator, we will iterate it several times, so
217
offsets = list(offsets)
220
retried_offset = None
224
# Coalesce the offsets to minimize the GET requests issued
225
sorted_offsets = sorted(offsets)
226
coalesced = self._coalesce_offsets(
227
sorted_offsets, limit=self._max_readv_combine,
228
fudge_factor=self._bytes_to_read_before_seek,
229
max_size=self._get_max_size)
231
# Turn it into a list, we will iterate it several times
232
coalesced = list(coalesced)
233
mutter('http readv of %s offsets => %s collapsed %s',
234
relpath, len(offsets), len(coalesced))
236
# Cache the data read, but only until it's been used
238
# We will iterate on the data received from the GET requests and
239
# serve the corresponding offsets respecting the initial order. We
240
# need an offset iterator for that.
241
iter_offsets = iter(offsets)
242
cur_offset_and_size = iter_offsets.next()
245
for cur_coal, rfile in self._coalesce_readv(relpath, coalesced):
246
# Split the received chunk
247
for offset, size in cur_coal.ranges:
248
start = cur_coal.start + offset
250
data = rfile.read(size)
253
raise errors.ShortReadvError(relpath, start, size,
255
if (start, size) == cur_offset_and_size:
256
# The offsets requested are sorted like the coalesced
257
# ones, no need to cache. Win !
258
yield cur_offset_and_size[0], data
259
cur_offset_and_size = iter_offsets.next()
261
# Different sorting. We need to cache.
262
data_map[(start, size)] = data
264
# Yield everything we can
265
while cur_offset_and_size in data_map:
266
# Clean the cached data since we use it
267
# XXX: will break if offsets contains duplicates --
269
this_data = data_map.pop(cur_offset_and_size)
270
yield cur_offset_and_size[0], this_data
271
cur_offset_and_size = iter_offsets.next()
273
except (errors.ShortReadvError, errors.InvalidRange,
274
errors.InvalidHttpRange), e:
275
mutter('Exception %r: %s during http._readv',e, e)
276
if (not isinstance(e, errors.ShortReadvError)
277
or retried_offset == cur_offset_and_size):
278
# We don't degrade the range hint for ShortReadvError since
279
# they do not indicate a problem with the server ability to
280
# handle ranges. Except when we fail to get back a required
281
# offset twice in a row. In that case, falling back to
282
# single range or whole file should help or end up in a
284
self._degrade_range_hint(relpath, coalesced, sys.exc_info())
285
# Some offsets may have been already processed, so we retry
286
# only the unsuccessful ones.
287
offsets = [cur_offset_and_size] + [o for o in iter_offsets]
288
retried_offset = cur_offset_and_size
291
def _coalesce_readv(self, relpath, coalesced):
292
"""Issue several GET requests to satisfy the coalesced offsets"""
294
def get_and_yield(relpath, coalesced):
296
# Note that the _get below may raise
297
# errors.InvalidHttpRange. It's the caller's responsibility to
298
# decide how to retry since it may provide different coalesced
300
code, rfile = self._get(relpath, coalesced)
301
for coal in coalesced:
304
if self._range_hint is None:
305
# Download whole file
306
for c, rfile in get_and_yield(relpath, coalesced):
309
total = len(coalesced)
310
if self._range_hint == 'multi':
311
max_ranges = self._max_get_ranges
312
elif self._range_hint == 'single':
315
raise AssertionError("Unknown _range_hint %r"
316
% (self._range_hint,))
317
# TODO: Some web servers may ignore the range requests and return
318
# the whole file, we may want to detect that and avoid further
320
# Hint: test_readv_multiple_get_requests will fail once we do that
323
for coal in coalesced:
324
if ((self._get_max_size > 0
325
and cumul + coal.length > self._get_max_size)
326
or len(ranges) >= max_ranges):
327
# Get that much and yield
328
for c, rfile in get_and_yield(relpath, ranges):
330
# Restart with the current offset
336
# Get the rest and yield
337
for c, rfile in get_and_yield(relpath, ranges):
340
def recommended_page_size(self):
341
"""See Transport.recommended_page_size().
343
For HTTP we suggest a large page size to reduce the overhead
344
introduced by latency.
348
def _post(self, body_bytes):
349
"""POST body_bytes to .bzr/smart on this transport.
351
:returns: (response code, response body file-like object).
353
# TODO: Requiring all the body_bytes to be available at the beginning of
354
# the POST may require large client buffers. It would be nice to have
355
# an interface that allows streaming via POST when possible (and
356
# degrades to a local buffer when not).
357
raise NotImplementedError(self._post)
359
def put_file(self, relpath, f, mode=None):
360
"""Copy the file-like object into the location.
144
return get_url(self.abspath(relpath))
145
except urllib2.URLError, e:
147
raise NoSuchFile(msg = "Error retrieving %s: %s"
148
% (self.abspath(relpath), str(e)),
151
except (BzrError, IOError), e:
152
raise NoSuchFile(msg = "Error retrieving %s: %s"
153
% (self.abspath(relpath), str(e)),
156
def put(self, relpath, f):
157
"""Copy the file-like or string object into the location.
362
159
:param relpath: Location to put the contents, relative to base.
363
:param f: File-like object.
160
:param f: File-like or string object.
365
raise errors.TransportNotPossible('http PUT not supported')
162
raise TransportNotPossible('http PUT not supported')
367
def mkdir(self, relpath, mode=None):
164
def mkdir(self, relpath):
368
165
"""Create a directory at the given path."""
369
raise errors.TransportNotPossible('http does not support mkdir()')
371
def rmdir(self, relpath):
372
"""See Transport.rmdir."""
373
raise errors.TransportNotPossible('http does not support rmdir()')
375
def append_file(self, relpath, f, mode=None):
166
raise TransportNotPossible('http does not support mkdir()')
168
def append(self, relpath, f):
376
169
"""Append the text in the file-like object into the final
379
raise errors.TransportNotPossible('http does not support append()')
172
raise TransportNotPossible('http does not support append()')
381
174
def copy(self, rel_from, rel_to):
382
175
"""Copy the item at rel_from to the location at rel_to"""
383
raise errors.TransportNotPossible('http does not support copy()')
176
raise TransportNotPossible('http does not support copy()')
385
def copy_to(self, relpaths, other, mode=None, pb=None):
178
def copy_to(self, relpaths, other, pb=None):
386
179
"""Copy a set of entries from self into another Transport.
388
181
:param relpaths: A list/generator of entries to be copied.