1
# Copyright (C) 2005, 2006, 2007 Canonical Ltd
1
# Copyright (C) 2005 Canonical Ltd
3
3
# This program is free software; you can redistribute it and/or modify
4
4
# it under the terms of the GNU General Public License as published by
5
5
# the Free Software Foundation; either version 2 of the License, or
6
6
# (at your option) any later version.
8
8
# This program is distributed in the hope that it will be useful,
9
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
11
# GNU General Public License for more details.
13
13
# You should have received a copy of the GNU General Public License
14
14
# along with this program; if not, write to the Free Software
15
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Base implementation of Transport over http.
19
There are separate implementation modules for each http client implementation.
16
"""Implementation of Transport over http.
19
from bzrlib.transport import Transport, register_transport
20
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
21
NonRelativePath, TransportError)
22
23
from cStringIO import StringIO
36
from bzrlib.smart import medium
37
from bzrlib.symbol_versioning import (
27
from bzrlib.errors import BzrError, BzrCheckError
28
from bzrlib.branch import Branch
40
29
from bzrlib.trace import mutter
41
from bzrlib.transport import (
47
# TODO: This is not used anymore by HttpTransport_urllib
48
# (extracting the auth info and prompting the user for a password
49
# have been split), only the tests still use it. It should be
50
# deleted and the tests rewritten ASAP to stay in sync.
51
def extract_auth(url, password_manager):
52
"""Extract auth parameters from an HTTP/HTTPS url and add them to the given
53
password manager. Return the url, minus those auth parameters (which
56
if not re.match(r'^(https?)(\+\w+)?://', url):
58
'invalid absolute url %r' % (url,))
59
scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
62
auth, netloc = netloc.split('@', 1)
64
username, password = auth.split(':', 1)
66
username, password = auth, None
68
host = netloc.split(':', 1)[0]
71
username = urllib.unquote(username)
72
if password is not None:
73
password = urllib.unquote(password)
75
password = ui.ui_factory.get_password(
76
prompt='HTTP %(user)s@%(host)s password',
77
user=username, host=host)
78
password_manager.add_password(None, host, username, password)
79
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
83
class HttpTransportBase(ConnectedTransport):
84
"""Base class for http implementations.
86
Does URL parsing, etc, but not any network IO.
88
The protocol can be given as e.g. http+urllib://host/ to use a particular
92
# _unqualified_scheme: "http" or "https"
93
# _scheme: may have "+pycurl", etc
95
def __init__(self, base, _from_transport=None):
31
# velocitynet.com.au transparently proxies connections and thereby
32
# breaks keep-alive -- sucks!
37
mutter("get_url %s" % url)
38
url_f = urllib2.urlopen(url)
41
class HttpTransportError(TransportError):
44
class HttpTransport(Transport):
45
"""This is the transport agent for http:// access.
47
TODO: Implement pipelined versions of all of the *_multi() functions.
50
def __init__(self, base):
96
51
"""Set the base path where files will be stored."""
97
proto_match = re.match(r'^(https?)(\+\w+)?://', base)
99
raise AssertionError("not a http url: %r" % base)
100
self._unqualified_scheme = proto_match.group(1)
101
impl_name = proto_match.group(2)
103
impl_name = impl_name[1:]
104
self._impl_name = impl_name
105
super(HttpTransportBase, self).__init__(base,
106
_from_transport=_from_transport)
108
# range hint is handled dynamically throughout the life
109
# of the transport object. We start by trying multi-range
110
# requests and if the server returns bogus results, we
111
# retry with single range requests and, finally, we
112
# forget about range if the server really can't
113
# understand. Once acquired, this piece of info is
114
# propagated to clones.
115
if _from_transport is not None:
116
self._range_hint = _from_transport._range_hint
52
assert base.startswith('http://') or base.startswith('https://')
53
super(HttpTransport, self).__init__(base)
54
# In the future we might actually connect to the remote host
55
# rather than using get_url
56
# self._connection = None
57
(self._proto, self._host,
58
self._path, self._parameters,
59
self._query, self._fragment) = urlparse.urlparse(self.base)
61
def should_cache(self):
62
"""Return True if the data pulled across should be cached locally.
66
def clone(self, offset=None):
67
"""Return a new HttpTransport with root at self.base + offset
68
For now HttpTransport does not actually connect, so just return
69
a new HttpTransport object.
72
return HttpTransport(self.base)
118
self._range_hint = 'multi'
74
return HttpTransport(self.abspath(offset))
76
def abspath(self, relpath):
77
"""Return the full url to the given relative path.
78
This can be supplied with a string or a list
80
if isinstance(relpath, basestring):
82
basepath = self._path.split('/')
83
if len(basepath) > 0 and basepath[-1] == '':
84
basepath = basepath[:-1]
89
# In most filesystems, a request for the parent
90
# of root, just returns root.
99
# Possibly, we could use urlparse.urljoin() here, but
100
# I'm concerned about when it chooses to strip the last
101
# portion of the path, and when it doesn't.
102
path = '/'.join(basepath)
103
return urlparse.urlunparse((self._proto,
104
self._host, path, '', '', ''))
106
def relpath(self, abspath):
107
if not abspath.startswith(self.base):
108
raise NonRelativePath('path %r is not under base URL %r'
109
% (abspath, self.base))
111
return abspath[pl:].lstrip('/')
120
113
def has(self, relpath):
121
raise NotImplementedError("has() is abstract on %r" % self)
123
def get(self, relpath):
114
"""Does the target location exist?
116
TODO: HttpTransport.has() should use a HEAD request,
117
not a full GET request.
119
TODO: This should be changed so that we don't use
120
urllib2 and get an exception, the code path would be
121
cleaner if we just do an http HEAD request, and parse
125
f = get_url(self.abspath(relpath))
126
# Without the read and then close()
127
# we tend to have busy sockets.
133
except urllib2.URLError:
136
if e.errno == errno.ENOENT:
138
raise HttpTransportError(orig_error=e)
140
def get(self, relpath, decode=False):
124
141
"""Get the file at the given relative path.
126
143
:param relpath: The relative path to the file
128
code, response_file = self._get(relpath, None)
129
# FIXME: some callers want an iterable... One step forward, three steps
130
# backwards :-/ And not only an iterable, but an iterable that can be
131
# seeked backwards, so we will never be able to do that. One such
132
# known client is bzrlib.bundle.serializer.v4.get_bundle_reader. At the
133
# time of this writing it's even the only known client -- vila20071203
134
return StringIO(response_file.read())
136
def _get(self, relpath, ranges, tail_amount=0):
137
"""Get a file, or part of a file.
139
:param relpath: Path relative to transport base URL
140
:param ranges: None to get the whole file;
141
or a list of _CoalescedOffset to fetch parts of a file.
142
:param tail_amount: The amount to get from the end of the file.
144
:returns: (http_code, result_file)
146
raise NotImplementedError(self._get)
148
def _remote_path(self, relpath):
149
"""See ConnectedTransport._remote_path.
151
user and passwords are not embedded in the path provided to the server.
153
relative = urlutils.unescape(relpath).encode('utf-8')
154
path = self._combine_paths(self._path, relative)
155
return self._unsplit_url(self._unqualified_scheme,
156
None, None, self._host, self._port, path)
158
def _create_auth(self):
159
"""Returns a dict returning the credentials provided at build time."""
160
auth = dict(host=self._host, port=self._port,
161
user=self._user, password=self._password,
162
protocol=self._unqualified_scheme,
166
def get_smart_medium(self):
167
"""See Transport.get_smart_medium."""
168
if self._medium is None:
169
# Since medium holds some state (smart server probing at least), we
170
# need to keep it around. Note that this is needed because medium
171
# has the same 'base' attribute as the transport so it can't be
172
# shared between transports having different bases.
173
self._medium = SmartClientHTTPMedium(self)
177
def _degrade_range_hint(self, relpath, ranges, exc_info):
178
if self._range_hint == 'multi':
179
self._range_hint = 'single'
180
mutter('Retry "%s" with single range request' % relpath)
181
elif self._range_hint == 'single':
182
self._range_hint = None
183
mutter('Retry "%s" without ranges' % relpath)
185
# We tried all the tricks, but nothing worked. We re-raise the
186
# original exception; the 'mutter' calls above will indicate that
187
# further tries were unsuccessful
188
raise exc_info[0], exc_info[1], exc_info[2]
190
# _coalesce_offsets is a helper for readv; it tries to combine ranges without
191
# degrading readv performance. _bytes_to_read_before_seek is the value
192
# used for the limit parameter and has been tuned for other transports. For
193
# HTTP, the name is inappropriate but the parameter is still useful and
194
# helps reduce the number of chunks in the response. The overhead for a
195
# chunk (headers, length, footer around the data itself) is variable but
196
# around 50 bytes. We use 128 to reduce the range specifiers that appear in
197
# the header, some servers (notably Apache) enforce a maximum length for a
198
# header and issue a '400: Bad request' error when too many ranges are
200
_bytes_to_read_before_seek = 128
201
# No limit on the number of offsets that get combined into one; we are trying
202
# to avoid downloading the whole file.
203
_max_readv_combine = 0
204
# By default Apache has a limit of ~400 ranges before replying with a 400
205
# Bad Request. So we go underneath that amount to be safe.
206
_max_get_ranges = 200
207
# We impose no limit on the range size. But see _pycurl.py for a different
211
def _readv(self, relpath, offsets):
212
"""Get parts of the file at the given relative path.
214
:param offsets: A list of (offset, size) tuples.
215
:return: A list or generator of (offset, data) tuples
218
# offsets may be a generator, we will iterate it several times, so
220
offsets = list(offsets)
223
retried_offset = None
227
# Coalesce the offsets to minimize the GET requests issued
228
sorted_offsets = sorted(offsets)
229
coalesced = self._coalesce_offsets(
230
sorted_offsets, limit=self._max_readv_combine,
231
fudge_factor=self._bytes_to_read_before_seek,
232
max_size=self._get_max_size)
234
# Turn it into a list, we will iterate it several times
235
coalesced = list(coalesced)
236
if 'http' in debug.debug_flags:
237
mutter('http readv of %s offsets => %s collapsed %s',
238
relpath, len(offsets), len(coalesced))
240
# Cache the data read, but only until it's been used
242
# We will iterate on the data received from the GET requests and
243
# serve the corresponding offsets respecting the initial order. We
244
# need an offset iterator for that.
245
iter_offsets = iter(offsets)
246
cur_offset_and_size = iter_offsets.next()
249
for cur_coal, rfile in self._coalesce_readv(relpath, coalesced):
250
# Split the received chunk
251
for offset, size in cur_coal.ranges:
252
start = cur_coal.start + offset
254
data = rfile.read(size)
257
raise errors.ShortReadvError(relpath, start, size,
259
if (start, size) == cur_offset_and_size:
260
# The requested offsets are sorted in the same order as the coalesced
261
# ones, no need to cache. Win !
262
yield cur_offset_and_size[0], data
263
cur_offset_and_size = iter_offsets.next()
265
# Different sorting. We need to cache.
266
data_map[(start, size)] = data
268
# Yield everything we can
269
while cur_offset_and_size in data_map:
270
# Clean the cached data since we use it
271
# XXX: will break if offsets contains duplicates --
273
this_data = data_map.pop(cur_offset_and_size)
274
yield cur_offset_and_size[0], this_data
275
cur_offset_and_size = iter_offsets.next()
277
except (errors.ShortReadvError, errors.InvalidRange,
278
errors.InvalidHttpRange), e:
279
mutter('Exception %r: %s during http._readv',e, e)
280
if (not isinstance(e, errors.ShortReadvError)
281
or retried_offset == cur_offset_and_size):
282
# We don't degrade the range hint for ShortReadvError since
283
# they do not indicate a problem with the server ability to
284
# handle ranges. Except when we fail to get back a required
285
# offset twice in a row. In that case, falling back to
286
# single range or whole file should help or end up in a
288
self._degrade_range_hint(relpath, coalesced, sys.exc_info())
289
# Some offsets may have been already processed, so we retry
290
# only the unsuccessful ones.
291
offsets = [cur_offset_and_size] + [o for o in iter_offsets]
292
retried_offset = cur_offset_and_size
295
def _coalesce_readv(self, relpath, coalesced):
296
"""Issue several GET requests to satisfy the coalesced offsets"""
298
def get_and_yield(relpath, coalesced):
300
# Note that the _get below may raise
301
# errors.InvalidHttpRange. It's the caller's responsibility to
302
# decide how to retry since it may provide different coalesced
304
code, rfile = self._get(relpath, coalesced)
305
for coal in coalesced:
308
if self._range_hint is None:
309
# Download whole file
310
for c, rfile in get_and_yield(relpath, coalesced):
313
total = len(coalesced)
314
if self._range_hint == 'multi':
315
max_ranges = self._max_get_ranges
316
elif self._range_hint == 'single':
319
raise AssertionError("Unknown _range_hint %r"
320
% (self._range_hint,))
321
# TODO: Some web servers may ignore the range requests and return
322
# the whole file, we may want to detect that and avoid further
324
# Hint: test_readv_multiple_get_requests will fail once we do that
327
for coal in coalesced:
328
if ((self._get_max_size > 0
329
and cumul + coal.length > self._get_max_size)
330
or len(ranges) >= max_ranges):
331
# Get that much and yield
332
for c, rfile in get_and_yield(relpath, ranges):
334
# Restart with the current offset
340
# Get the rest and yield
341
for c, rfile in get_and_yield(relpath, ranges):
344
def recommended_page_size(self):
345
"""See Transport.recommended_page_size().
347
For HTTP we suggest a large page size to reduce the overhead
348
introduced by latency.
352
def _post(self, body_bytes):
353
"""POST body_bytes to .bzr/smart on this transport.
355
:returns: (response code, response body file-like object).
357
# TODO: Requiring all the body_bytes to be available at the beginning of
358
# the POST may require large client buffers. It would be nice to have
359
# an interface that allows streaming via POST when possible (and
360
# degrades to a local buffer when not).
361
raise NotImplementedError(self._post)
363
def put_file(self, relpath, f, mode=None):
364
"""Copy the file-like object into the location.
146
return get_url(self.abspath(relpath))
147
except (BzrError, urllib2.URLError, IOError), e:
148
raise NoSuchFile(msg = "Error retrieving %s: %s"
149
% (self.abspath(relpath), str(e)),
152
def get_partial(self, relpath, start, length=None):
153
"""Get just part of a file.
155
:param relpath: Path to the file, relative to base
156
:param start: The starting position to read from
157
:param length: The length to read. A length of None indicates
158
read to the end of the file.
159
:return: A file-like object containing at least the specified bytes.
160
Some implementations may return objects which can be read
161
past this length, but this is not guaranteed.
163
# TODO: You can make specialized http requests for just
164
# a portion of the file. Figure out how to do that.
165
# For now, urllib2 returns files that cannot seek() so
166
# we just read bytes off the beginning, until we
167
# get to the point that we care about.
168
f = self.get(relpath)
169
# TODO: read in smaller chunks, in case things are
170
# buffered internally.
174
def put(self, relpath, f):
175
"""Copy the file-like or string object into the location.
366
177
:param relpath: Location to put the contents, relative to base.
367
:param f: File-like object.
178
:param f: File-like or string object.
369
raise errors.TransportNotPossible('http PUT not supported')
180
raise TransportNotPossible('http PUT not supported')
371
def mkdir(self, relpath, mode=None):
182
def mkdir(self, relpath):
372
183
"""Create a directory at the given path."""
373
raise errors.TransportNotPossible('http does not support mkdir()')
375
def rmdir(self, relpath):
376
"""See Transport.rmdir."""
377
raise errors.TransportNotPossible('http does not support rmdir()')
379
def append_file(self, relpath, f, mode=None):
184
raise TransportNotPossible('http does not support mkdir()')
186
def append(self, relpath, f):
380
187
"""Append the text in the file-like object into the final
383
raise errors.TransportNotPossible('http does not support append()')
190
raise TransportNotPossible('http does not support append()')
385
192
def copy(self, rel_from, rel_to):
386
193
"""Copy the item at rel_from to the location at rel_to"""
387
raise errors.TransportNotPossible('http does not support copy()')
194
raise TransportNotPossible('http does not support copy()')
389
def copy_to(self, relpaths, other, mode=None, pb=None):
196
def copy_to(self, relpaths, other, pb=None):
390
197
"""Copy a set of entries from self into another Transport.
392
199
:param relpaths: A list/generator of entries to be copied.
450
246
:return: A lock object, which should be passed to Transport.unlock()
452
raise errors.TransportNotPossible('http does not support lock_write()')
454
def clone(self, offset=None):
455
"""Return a new HttpTransportBase with root at self.base + offset
457
We let the daughter classes take advantage of the hint
458
that it's a cloning not a raw creation.
461
return self.__class__(self.base, self)
463
return self.__class__(self.abspath(offset), self)
465
def _attempted_range_header(self, offsets, tail_amount):
466
"""Prepare a HTTP Range header at a level the server should accept.
468
:return: the range header representing offsets/tail_amount or None if
469
no header can be built.
472
if self._range_hint == 'multi':
473
# Generate the header describing all offsets
474
return self._range_header(offsets, tail_amount)
475
elif self._range_hint == 'single':
476
# Combine all the requested ranges into a single
479
if tail_amount not in (0, None):
480
# Nothing we can do here to combine ranges with tail_amount
481
# in a single range, just returns None. The whole file
482
# should be downloaded.
485
start = offsets[0].start
487
end = last.start + last.length - 1
488
whole = self._coalesce_offsets([(start, end - start + 1)],
489
limit=0, fudge_factor=0)
490
return self._range_header(list(whole), 0)
492
# Only tail_amount, requested, leave range_header
494
return self._range_header(offsets, tail_amount)
499
def _range_header(ranges, tail_amount):
500
"""Turn a list of bytes ranges into a HTTP Range header value.
502
:param ranges: A list of _CoalescedOffset
503
:param tail_amount: The amount to get from the end of the file.
505
:return: HTTP range header string.
507
At least a non-empty ranges *or* a tail_amount must be
511
for offset in ranges:
512
strings.append('%d-%d' % (offset.start,
513
offset.start + offset.length - 1))
516
strings.append('-%d' % tail_amount)
518
return ','.join(strings)
521
# TODO: May be better located in smart/medium.py with the other
522
# SmartMedium classes
523
class SmartClientHTTPMedium(medium.SmartClientMedium):
525
def __init__(self, http_transport):
526
super(SmartClientHTTPMedium, self).__init__(http_transport.base)
527
# We don't want to create a circular reference between the http
528
# transport and its associated medium. Since the transport will live
529
# longer than the medium, the medium keeps only a weak reference to its
531
self._http_transport_ref = weakref.ref(http_transport)
533
def get_request(self):
534
return SmartClientHTTPMediumRequest(self)
536
def should_probe(self):
539
def remote_path_from_transport(self, transport):
540
# Strip the optional 'bzr+' prefix from transport so it will have the
541
# same scheme as self.
542
transport_base = transport.base
543
if transport_base.startswith('bzr+'):
544
transport_base = transport_base[4:]
545
rel_url = urlutils.relative_url(self.base, transport_base)
546
return urllib.unquote(rel_url)
548
def send_http_smart_request(self, bytes):
550
# Get back the http_transport held by the weak reference
551
t = self._http_transport_ref()
552
code, body_filelike = t._post(bytes)
554
raise InvalidHttpResponse(
555
t._remote_path('.bzr/smart'),
556
'Expected 200 response code, got %r' % (code,))
557
except errors.InvalidHttpResponse, e:
558
raise errors.SmartProtocolError(str(e))
562
# TODO: May be better located in smart/medium.py with the other
563
# SmartMediumRequest classes
564
class SmartClientHTTPMediumRequest(medium.SmartClientMediumRequest):
565
"""A SmartClientMediumRequest that works with an HTTP medium."""
567
def __init__(self, client_medium):
568
medium.SmartClientMediumRequest.__init__(self, client_medium)
571
def _accept_bytes(self, bytes):
572
self._buffer += bytes
574
def _finished_writing(self):
575
data = self._medium.send_http_smart_request(self._buffer)
576
self._response_body = data
578
def _read_bytes(self, count):
579
"""See SmartClientMediumRequest._read_bytes."""
580
return self._response_body.read(count)
582
def _read_line(self):
583
line, excess = medium._get_line(self._response_body.read)
585
raise AssertionError(
586
'_get_line returned excess bytes, but this mediumrequest '
587
'cannot handle excess. (%r)' % (excess,))
590
def _finished_reading(self):
591
"""See SmartClientMediumRequest._finished_reading."""
248
raise TransportNotPossible('http does not support lock_write()')
250
register_transport('http://', HttpTransport)
251
register_transport('https://', HttpTransport)