1
# Copyright (C) 2005-2010 Canonical Ltd
1
# Copyright (C) 2005 Canonical Ltd
3
3
# This program is free software; you can redistribute it and/or modify
4
4
# it under the terms of the GNU General Public License as published by
5
5
# the Free Software Foundation; either version 2 of the License, or
6
6
# (at your option) any later version.
8
8
# This program is distributed in the hope that it will be useful,
9
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
11
# GNU General Public License for more details.
13
13
# You should have received a copy of the GNU General Public License
14
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Base implementation of Transport over http.
19
There are separate implementation modules for each http client implementation.
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16
"""Implementation of Transport over http.
19
from bzrlib.transport import Transport, register_transport
20
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
21
NonRelativePath, TransportError)
22
23
from cStringIO import StringIO
36
from bzrlib.smart import medium
27
from bzrlib.errors import BzrError, BzrCheckError
28
from bzrlib.branch import Branch
37
29
from bzrlib.trace import mutter
38
from bzrlib.transport import (
42
# TODO: This is not used anymore by HttpTransport_urllib
43
# (extracting the auth info and prompting the user for a password
44
# have been split), only the tests still use it. It should be
45
# deleted and the tests rewritten ASAP to stay in sync.
46
def extract_auth(url, password_manager):
47
"""Extract auth parameters from am HTTP/HTTPS url and add them to the given
48
password manager. Return the url, minus those auth parameters (which
51
if not re.match(r'^(https?)(\+\w+)?://', url):
53
'invalid absolute url %r' % (url,))
54
scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
57
auth, netloc = netloc.split('@', 1)
59
username, password = auth.split(':', 1)
61
username, password = auth, None
63
host = netloc.split(':', 1)[0]
66
username = urllib.unquote(username)
67
if password is not None:
68
password = urllib.unquote(password)
70
password = ui.ui_factory.get_password(
71
prompt=u'HTTP %(user)s@%(host)s password',
72
user=username, host=host)
73
password_manager.add_password(None, host, username, password)
74
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
78
class HttpTransportBase(ConnectedTransport):
79
"""Base class for http implementations.
81
Does URL parsing, etc, but not any network IO.
83
The protocol can be given as e.g. http+urllib://host/ to use a particular
87
# _unqualified_scheme: "http" or "https"
88
# _scheme: may have "+pycurl", etc
90
def __init__(self, base, _impl_name, _from_transport=None):
34
mutter("get_url %s" % url)
35
url_f = urllib2.urlopen(url)
38
class HttpTransportError(TransportError):
41
class HttpTransport(Transport):
42
"""This is the transport agent for http:// access.
44
TODO: Implement pipelined versions of all of the *_multi() functions.
47
def __init__(self, base):
91
48
"""Set the base path where files will be stored."""
92
proto_match = re.match(r'^(https?)(\+\w+)?://', base)
94
raise AssertionError("not a http url: %r" % base)
95
self._unqualified_scheme = proto_match.group(1)
96
self._impl_name = _impl_name
97
super(HttpTransportBase, self).__init__(base,
98
_from_transport=_from_transport)
100
# range hint is handled dynamically throughout the life
101
# of the transport object. We start by trying multi-range
102
# requests and if the server returns bogus results, we
103
# retry with single range requests and, finally, we
104
# forget about range if the server really can't
105
# understand. Once acquired, this piece of info is
106
# propagated to clones.
107
if _from_transport is not None:
108
self._range_hint = _from_transport._range_hint
110
self._range_hint = 'multi'
49
assert base.startswith('http://') or base.startswith('https://')
50
super(HttpTransport, self).__init__(base)
51
# In the future we might actually connect to the remote host
52
# rather than using get_url
53
# self._connection = None
54
(self._proto, self._host,
55
self._path, self._parameters,
56
self._query, self._fragment) = urlparse.urlparse(self.base)
58
def should_cache(self):
59
"""Return True if the data pulled across should be cached locally.
63
def clone(self, offset=None):
64
"""Return a new HttpTransport with root at self.base + offset
65
For now HttpTransport does not actually connect, so just return
66
a new HttpTransport object.
69
return HttpTransport(self.base)
71
return HttpTransport(self.abspath(offset))
73
def abspath(self, relpath):
74
"""Return the full url to the given relative path.
75
This can be supplied with a string or a list
77
assert isinstance(relpath, basestring)
78
if isinstance(relpath, basestring):
79
relpath_parts = relpath.split('/')
81
# TODO: Don't call this with an array - no magic interfaces
82
relpath_parts = relpath[:]
83
if len(relpath_parts) > 1:
84
if relpath_parts[0] == '':
85
raise ValueError("path %r within branch %r seems to be absolute"
86
% (relpath, self._path))
87
if relpath_parts[-1] == '':
88
raise ValueError("path %r within branch %r seems to be a directory"
89
% (relpath, self._path))
90
basepath = self._path.split('/')
91
if len(basepath) > 0 and basepath[-1] == '':
92
basepath = basepath[:-1]
93
for p in relpath_parts:
95
if len(basepath) == 0:
96
# In most filesystems, a request for the parent
97
# of root, just returns root.
100
elif p == '.' or p == '':
104
# Possibly, we could use urlparse.urljoin() here, but
105
# I'm concerned about when it chooses to strip the last
106
# portion of the path, and when it doesn't.
107
path = '/'.join(basepath)
108
return urlparse.urlunparse((self._proto,
109
self._host, path, '', '', ''))
112
111
def has(self, relpath):
113
raise NotImplementedError("has() is abstract on %r" % self)
115
def get(self, relpath):
112
"""Does the target location exist?
114
TODO: HttpTransport.has() should use a HEAD request,
115
not a full GET request.
117
TODO: This should be changed so that we don't use
118
urllib2 and get an exception, the code path would be
119
cleaner if we just do an http HEAD request, and parse
123
f = get_url(self.abspath(relpath))
124
# Without the read and then close()
125
# we tend to have busy sockets.
129
except urllib2.URLError, e:
134
if e.errno == errno.ENOENT:
136
raise HttpTransportError(orig_error=e)
138
def get(self, relpath, decode=False):
116
139
"""Get the file at the given relative path.
118
141
:param relpath: The relative path to the file
120
code, response_file = self._get(relpath, None)
121
# FIXME: some callers want an iterable... One step forward, three steps
122
# backwards :-/ And not only an iterable, but an iterable that can be
123
# seeked backwards, so we will never be able to do that. One such
124
# known client is bzrlib.bundle.serializer.v4.get_bundle_reader. At the
125
# time of this writing it's even the only known client -- vila20071203
126
return StringIO(response_file.read())
128
def _get(self, relpath, ranges, tail_amount=0):
129
"""Get a file, or part of a file.
131
:param relpath: Path relative to transport base URL
132
:param ranges: None to get the whole file;
133
or a list of _CoalescedOffset to fetch parts of a file.
134
:param tail_amount: The amount to get from the end of the file.
136
:returns: (http_code, result_file)
138
raise NotImplementedError(self._get)
140
def _remote_path(self, relpath):
141
"""See ConnectedTransport._remote_path.
143
user and passwords are not embedded in the path provided to the server.
145
url = self._parsed_url.clone(relpath)
146
url.user = url.quoted_user = None
147
url.password = url.quoted_password = None
148
url.scheme = self._unqualified_scheme
151
def _create_auth(self):
152
"""Returns a dict containing the credentials provided at build time."""
153
auth = dict(host=self._parsed_url.host, port=self._parsed_url.port,
154
user=self._parsed_url.user, password=self._parsed_url.password,
155
protocol=self._unqualified_scheme,
156
path=self._parsed_url.path)
159
def get_smart_medium(self):
160
"""See Transport.get_smart_medium."""
161
if self._medium is None:
162
# Since medium holds some state (smart server probing at least), we
163
# need to keep it around. Note that this is needed because medium
164
# has the same 'base' attribute as the transport so it can't be
165
# shared between transports having different bases.
166
self._medium = SmartClientHTTPMedium(self)
169
def _degrade_range_hint(self, relpath, ranges, exc_info):
170
if self._range_hint == 'multi':
171
self._range_hint = 'single'
172
mutter('Retry "%s" with single range request' % relpath)
173
elif self._range_hint == 'single':
174
self._range_hint = None
175
mutter('Retry "%s" without ranges' % relpath)
177
# We tried all the tricks, but nothing worked. We re-raise the
178
# original exception; the 'mutter' calls above will indicate that
179
# further tries were unsuccessful
180
raise exc_info[0], exc_info[1], exc_info[2]
182
# _coalesce_offsets is a helper for readv, it tries to combine ranges without
183
# degrading readv performance. _bytes_to_read_before_seek is the value
184
# used for the limit parameter and has been tuned for other transports. For
185
# HTTP, the name is inappropriate but the parameter is still useful and
186
# helps reduce the number of chunks in the response. The overhead for a
187
# chunk (headers, length, footer around the data itself) is variable but
188
# around 50 bytes. We use 128 to reduce the range specifiers that appear in
189
# the header, some servers (notably Apache) enforce a maximum length for a
190
# header and issue a '400: Bad request' error when too many ranges are
192
_bytes_to_read_before_seek = 128
193
# No limit on the offset number that get combined into one, we are trying
194
# to avoid downloading the whole file.
195
_max_readv_combine = 0
196
# By default Apache has a limit of ~400 ranges before replying with a 400
197
# Bad Request. So we go underneath that amount to be safe.
198
_max_get_ranges = 200
199
# We impose no limit on the range size. But see _pycurl.py for a different
203
def _readv(self, relpath, offsets):
204
"""Get parts of the file at the given relative path.
206
:param offsets: A list of (offset, size) tuples.
207
:param return: A list or generator of (offset, data) tuples
209
# offsets may be a generator, we will iterate it several times, so
211
offsets = list(offsets)
214
retried_offset = None
218
# Coalesce the offsets to minimize the GET requests issued
219
sorted_offsets = sorted(offsets)
220
coalesced = self._coalesce_offsets(
221
sorted_offsets, limit=self._max_readv_combine,
222
fudge_factor=self._bytes_to_read_before_seek,
223
max_size=self._get_max_size)
225
# Turn it into a list, we will iterate it several times
226
coalesced = list(coalesced)
227
if 'http' in debug.debug_flags:
228
mutter('http readv of %s offsets => %s collapsed %s',
229
relpath, len(offsets), len(coalesced))
231
# Cache the data read, but only until it's been used
233
# We will iterate on the data received from the GET requests and
234
# serve the corresponding offsets respecting the initial order. We
235
# need an offset iterator for that.
236
iter_offsets = iter(offsets)
237
cur_offset_and_size = iter_offsets.next()
240
for cur_coal, rfile in self._coalesce_readv(relpath, coalesced):
241
# Split the received chunk
242
for offset, size in cur_coal.ranges:
243
start = cur_coal.start + offset
245
data = rfile.read(size)
248
raise errors.ShortReadvError(relpath, start, size,
250
if (start, size) == cur_offset_and_size:
251
# The offset requested are sorted as the coalesced
252
# ones, no need to cache. Win !
253
yield cur_offset_and_size[0], data
254
cur_offset_and_size = iter_offsets.next()
256
# Different sorting. We need to cache.
257
data_map[(start, size)] = data
259
# Yield everything we can
260
while cur_offset_and_size in data_map:
261
# Clean the cached data since we use it
262
# XXX: will break if offsets contains duplicates --
264
this_data = data_map.pop(cur_offset_and_size)
265
yield cur_offset_and_size[0], this_data
266
cur_offset_and_size = iter_offsets.next()
268
except (errors.ShortReadvError, errors.InvalidRange,
269
errors.InvalidHttpRange, errors.HttpBoundaryMissing), e:
270
mutter('Exception %r: %s during http._readv',e, e)
271
if (not isinstance(e, errors.ShortReadvError)
272
or retried_offset == cur_offset_and_size):
273
# We don't degrade the range hint for ShortReadvError since
274
# they do not indicate a problem with the server ability to
275
# handle ranges. Except when we fail to get back a required
276
# offset twice in a row. In that case, falling back to
277
# single range or whole file should help or end up in a
279
self._degrade_range_hint(relpath, coalesced, sys.exc_info())
280
# Some offsets may have been already processed, so we retry
281
# only the unsuccessful ones.
282
offsets = [cur_offset_and_size] + [o for o in iter_offsets]
283
retried_offset = cur_offset_and_size
286
def _coalesce_readv(self, relpath, coalesced):
287
"""Issue several GET requests to satisfy the coalesced offsets"""
289
def get_and_yield(relpath, coalesced):
291
# Note that the _get below may raise
292
# errors.InvalidHttpRange. It's the caller's responsibility to
293
# decide how to retry since it may provide different coalesced
295
code, rfile = self._get(relpath, coalesced)
296
for coal in coalesced:
299
if self._range_hint is None:
300
# Download whole file
301
for c, rfile in get_and_yield(relpath, coalesced):
304
total = len(coalesced)
305
if self._range_hint == 'multi':
306
max_ranges = self._max_get_ranges
307
elif self._range_hint == 'single':
310
raise AssertionError("Unknown _range_hint %r"
311
% (self._range_hint,))
312
# TODO: Some web servers may ignore the range requests and return
313
# the whole file, we may want to detect that and avoid further
315
# Hint: test_readv_multiple_get_requests will fail once we do that
318
for coal in coalesced:
319
if ((self._get_max_size > 0
320
and cumul + coal.length > self._get_max_size)
321
or len(ranges) >= max_ranges):
322
# Get that much and yield
323
for c, rfile in get_and_yield(relpath, ranges):
325
# Restart with the current offset
331
# Get the rest and yield
332
for c, rfile in get_and_yield(relpath, ranges):
335
def recommended_page_size(self):
336
"""See Transport.recommended_page_size().
338
For HTTP we suggest a large page size to reduce the overhead
339
introduced by latency.
343
def _post(self, body_bytes):
344
"""POST body_bytes to .bzr/smart on this transport.
346
:returns: (response code, response body file-like object).
348
# TODO: Requiring all the body_bytes to be available at the beginning of
349
# the POST may require large client buffers. It would be nice to have
350
# an interface that allows streaming via POST when possible (and
351
# degrades to a local buffer when not).
352
raise NotImplementedError(self._post)
354
def put_file(self, relpath, f, mode=None):
355
"""Copy the file-like object into the location.
144
return get_url(self.abspath(relpath))
145
except urllib2.URLError, e:
147
raise NoSuchFile(msg = "Error retrieving %s: %s"
148
% (self.abspath(relpath), str(e)),
151
except (BzrError, IOError), e:
152
raise NoSuchFile(msg = "Error retrieving %s: %s"
153
% (self.abspath(relpath), str(e)),
156
def put(self, relpath, f):
157
"""Copy the file-like or string object into the location.
357
159
:param relpath: Location to put the contents, relative to base.
358
:param f: File-like object.
160
:param f: File-like or string object.
360
raise errors.TransportNotPossible('http PUT not supported')
162
raise TransportNotPossible('http PUT not supported')
362
def mkdir(self, relpath, mode=None):
164
def mkdir(self, relpath):
363
165
"""Create a directory at the given path."""
364
raise errors.TransportNotPossible('http does not support mkdir()')
366
def rmdir(self, relpath):
    """See Transport.rmdir.

    :param relpath: Unused; the request is refused whatever the path.
    :raises errors.TransportNotPossible: always.
    """
    raise errors.TransportNotPossible('http does not support rmdir()')
370
def append_file(self, relpath, f, mode=None):
166
raise TransportNotPossible('http does not support mkdir()')
168
def append(self, relpath, f):
371
169
"""Append the text in the file-like object into the final
374
raise errors.TransportNotPossible('http does not support append()')
172
raise TransportNotPossible('http does not support append()')
376
174
def copy(self, rel_from, rel_to):
377
175
"""Copy the item at rel_from to the location at rel_to"""
378
raise errors.TransportNotPossible('http does not support copy()')
176
raise TransportNotPossible('http does not support copy()')
380
def copy_to(self, relpaths, other, mode=None, pb=None):
178
def copy_to(self, relpaths, other, pb=None):
381
179
"""Copy a set of entries from self into another Transport.
383
181
:param relpaths: A list/generator of entries to be copied.
444
228
:return: A lock object, which should be passed to Transport.unlock()
446
raise errors.TransportNotPossible('http does not support lock_write()')
448
def _attempted_range_header(self, offsets, tail_amount):
449
"""Prepare a HTTP Range header at a level the server should accept.
451
:return: the range header representing offsets/tail_amount or None if
452
no header can be built.
455
if self._range_hint == 'multi':
456
# Generate the header describing all offsets
457
return self._range_header(offsets, tail_amount)
458
elif self._range_hint == 'single':
459
# Combine all the requested ranges into a single
462
if tail_amount not in (0, None):
463
# Nothing we can do here to combine ranges with tail_amount
464
# in a single range, just returns None. The whole file
465
# should be downloaded.
468
start = offsets[0].start
470
end = last.start + last.length - 1
471
whole = self._coalesce_offsets([(start, end - start + 1)],
472
limit=0, fudge_factor=0)
473
return self._range_header(list(whole), 0)
475
# Only tail_amount, requested, leave range_header
477
return self._range_header(offsets, tail_amount)
482
def _range_header(ranges, tail_amount):
483
"""Turn a list of bytes ranges into a HTTP Range header value.
485
:param ranges: A list of _CoalescedOffset
486
:param tail_amount: The amount to get from the end of the file.
488
:return: HTTP range header string.
490
At least a non-empty ranges *or* a tail_amount must be
494
for offset in ranges:
495
strings.append('%d-%d' % (offset.start,
496
offset.start + offset.length - 1))
499
strings.append('-%d' % tail_amount)
501
return ','.join(strings)
503
def _redirected_to(self, source, target):
504
"""Returns a transport suitable to re-issue a redirected request.
506
:param source: The source url as returned by the server.
507
:param target: The target url as returned by the server.
509
The redirection can be handled only if the relpath involved is not
510
renamed by the redirection.
512
:returns: A transport or None.
514
parsed_source = self._split_url(source)
515
parsed_target = self._split_url(target)
516
pl = len(self._parsed_url.path)
517
# determine the excess tail - the relative path that was in
518
# the original request but not part of this transports' URL.
519
excess_tail = parsed_source.path[pl:].strip("/")
520
if not target.endswith(excess_tail):
521
# The final part of the url has been renamed, we can't handle the
525
target_path = parsed_target.path
527
# Drop the tail that was in the redirect but not part of
528
# the path of this transport.
529
target_path = target_path[:-len(excess_tail)]
531
if parsed_target.scheme in ('http', 'https'):
532
# Same protocol family (i.e. http[s]), we will preserve the same
533
# http client implementation when a redirection occurs from one to
534
# the other (otherwise users may be surprised that bzr switches
535
# from one implementation to the other, and devs may suffer
537
if (parsed_target.scheme == self._unqualified_scheme
538
and parsed_target.host == self._parsed_url.host
539
and parsed_target.port == self._parsed_url.port
540
and (parsed_target.user is None or
541
parsed_target.user == self._parsed_url.user)):
542
# If a user is specified, it should match, we don't care about
543
# passwords, wrong passwords will be rejected anyway.
544
return self.clone(target_path)
546
# Rebuild the url preserving the scheme qualification and the
547
# credentials (if they don't apply, the redirected to server
548
# will tell us, but if they do apply, we avoid prompting the
550
redir_scheme = parsed_target.scheme + '+' + self._impl_name
551
new_url = self._unsplit_url(redir_scheme,
552
self._parsed_url.user,
553
self._parsed_url.password,
554
parsed_target.host, parsed_target.port,
556
return transport.get_transport_from_url(new_url)
558
# Redirected to a different protocol
559
new_url = self._unsplit_url(parsed_target.scheme,
561
parsed_target.password,
562
parsed_target.host, parsed_target.port,
564
return transport.get_transport_from_url(new_url)
567
# TODO: May be better located in smart/medium.py with the other
568
# SmartMedium classes
569
class SmartClientHTTPMedium(medium.SmartClientMedium):
571
def __init__(self, http_transport):
    """Create a medium attached to ``http_transport``.

    :param http_transport: The transport this medium works for; its
        ``base`` is handed to the parent class constructor.
    """
    super(SmartClientHTTPMedium, self).__init__(http_transport.base)
    # We don't want to create a circular reference between the http
    # transport and its associated medium. Since the transport will live
    # longer than the medium, the medium keeps only a weak reference to its
    # transport.
    self._http_transport_ref = weakref.ref(http_transport)
579
def get_request(self):
    """Return a new SmartClientHTTPMediumRequest bound to this medium."""
    return SmartClientHTTPMediumRequest(self)
582
def should_probe(self):
585
def remote_path_from_transport(self, transport):
    """Return the path of ``transport`` relative to this medium's base.

    :param transport: An object exposing a ``base`` url attribute.
    :return: The url of ``transport.base`` relative to ``self.base``, as
        computed by ``urlutils.relative_url``, with %-escapes unquoted.
    """
    # Strip the optional 'bzr+' prefix from transport so it will have the
    # same scheme as self.
    transport_base = transport.base
    if transport_base.startswith('bzr+'):
        transport_base = transport_base[len('bzr+'):]
    rel_url = urlutils.relative_url(self.base, transport_base)
    return urllib.unquote(rel_url)
594
def send_http_smart_request(self, bytes):
596
# Get back the http_transport hold by the weak reference
597
t = self._http_transport_ref()
598
code, body_filelike = t._post(bytes)
600
raise errors.InvalidHttpResponse(
601
t._remote_path('.bzr/smart'),
602
'Expected 200 response code, got %r' % (code,))
603
except (errors.InvalidHttpResponse, errors.ConnectionReset), e:
604
raise errors.SmartProtocolError(str(e))
607
def _report_activity(self, bytes, direction):
608
"""See SmartMedium._report_activity.
610
Does nothing; the underlying plain HTTP transport will report the
611
activity that this medium would report.
615
def disconnect(self):
616
"""See SmartClientMedium.disconnect()."""
617
t = self._http_transport_ref()
621
# TODO: May be better located in smart/medium.py with the other
622
# SmartMediumRequest classes
623
class SmartClientHTTPMediumRequest(medium.SmartClientMediumRequest):
624
"""A SmartClientMediumRequest that works with an HTTP medium."""
626
def __init__(self, client_medium):
627
medium.SmartClientMediumRequest.__init__(self, client_medium)
630
def _accept_bytes(self, bytes):
631
self._buffer += bytes
633
def _finished_writing(self):
    """Send the buffered request and keep the response for reading.

    The whole of ``self._buffer`` is handed to the medium's
    ``send_http_smart_request``; whatever it returns is stored as
    ``self._response_body``.
    """
    data = self._medium.send_http_smart_request(self._buffer)
    self._response_body = data
637
def _read_bytes(self, count):
638
"""See SmartClientMediumRequest._read_bytes."""
639
return self._response_body.read(count)
641
def _read_line(self):
642
line, excess = medium._get_line(self._response_body.read)
644
raise AssertionError(
645
'_get_line returned excess bytes, but this mediumrequest '
646
'cannot handle excess. (%r)' % (excess,))
649
def _finished_reading(self):
650
"""See SmartClientMediumRequest._finished_reading."""
654
def unhtml_roughly(maybe_html, length_limit=1000):
    """Very approximate html->text translation, for presenting error bodies.

    Every tag-like ``<...>`` run, every newline and every character matched
    by the last alternative of the pattern is replaced by a single space;
    the result is then cut to ``length_limit`` characters.

    :param maybe_html: Text that may contain html markup.
    :param length_limit: Truncate the result to this many characters.
    :return: The substituted text, at most ``length_limit`` characters long.

    >>> unhtml_roughly("<b>bad</b> things happened\\n")
    ' bad  things happened '
    """
    # NOTE(review): the last alternative shows up here as a plain space, which
    # makes that branch a no-op (space -> space).  It may originally have been
    # an HTML entity such as a non-breaking space -- confirm before changing.
    return re.subn(r"(<[^>]*>|\n| )", " ", maybe_html)[0][:length_limit]
230
raise TransportNotPossible('http does not support lock_write()')