1
# Copyright (C) 2005, 2006 Canonical Ltd
1
# Copyright (C) 2005 Canonical Ltd
3
3
# This program is free software; you can redistribute it and/or modify
4
4
# it under the terms of the GNU General Public License as published by
5
5
# the Free Software Foundation; either version 2 of the License, or
6
6
# (at your option) any later version.
8
8
# This program is distributed in the hope that it will be useful,
9
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
11
# GNU General Public License for more details.
13
13
# You should have received a copy of the GNU General Public License
14
14
# along with this program; if not, write to the Free Software
15
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Base implementation of Transport over http.
19
There are separate implementation modules for each http client implementation.
16
"""Implementation of Transport over http.
19
from bzrlib.transport import Transport, register_transport
20
from bzrlib.errors import (TransportNotPossible, NoSuchFile,
21
NonRelativePath, TransportError, ConnectionError)
22
23
from cStringIO import StringIO
34
from bzrlib.smart import medium
35
from bzrlib.symbol_versioning import (
27
from bzrlib.errors import BzrError, BzrCheckError
28
from bzrlib.branch import Branch
39
29
from bzrlib.trace import mutter
40
from bzrlib.transport import (
46
# TODO: This is not used anymore by HttpTransport_urllib
47
# (extracting the auth info and prompting the user for a password
48
# have been split), only the tests still use it. It should be
49
# deleted and the tests rewritten ASAP to stay in sync.
50
def extract_auth(url, password_manager):
51
"""Extract auth parameters from am HTTP/HTTPS url and add them to the given
52
password manager. Return the url, minus those auth parameters (which
55
assert re.match(r'^(https?)(\+\w+)?://', url), \
56
'invalid absolute url %r' % url
57
scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
60
auth, netloc = netloc.split('@', 1)
62
username, password = auth.split(':', 1)
64
username, password = auth, None
66
host = netloc.split(':', 1)[0]
69
username = urllib.unquote(username)
70
if password is not None:
71
password = urllib.unquote(password)
73
password = ui.ui_factory.get_password(
74
prompt='HTTP %(user)s@%(host)s password',
75
user=username, host=host)
76
password_manager.add_password(None, host, username, password)
77
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
81
def _extract_headers(header_text, url):
82
"""Extract the mapping for an rfc2822 header
84
This is a helper function for the test suite and for _pycurl.
85
(urllib already parses the headers for us)
87
In the case that there are multiple headers inside the file,
88
the last one is returned.
90
:param header_text: A string of header information.
91
This expects that the first line of a header will always be HTTP ...
92
:param url: The url we are parsing, so we can raise nice errors
93
:return: mimetools.Message object, which basically acts like a case
94
insensitive dictionary.
97
remaining = header_text
100
raise errors.InvalidHttpResponse(url, 'Empty headers')
103
header_file = StringIO(remaining)
104
first_line = header_file.readline()
105
if not first_line.startswith('HTTP'):
106
if first_header: # The first header *must* start with HTTP
107
raise errors.InvalidHttpResponse(url,
108
'Opening header line did not start with HTTP: %s'
111
break # We are done parsing
113
m = mimetools.Message(header_file)
115
# mimetools.Message parses the first header up to a blank line
116
# So while there is remaining data, it probably means there is
117
# another header to be parsed.
118
# Get rid of any preceding whitespace, which if it is all whitespace
119
# will get rid of everything.
120
remaining = header_file.read().lstrip()
124
class HttpTransportBase(ConnectedTransport, medium.SmartClientMedium):
125
"""Base class for http implementations.
127
Does URL parsing, etc, but not any network IO.
129
The protocol can be given as e.g. http+urllib://host/ to use a particular
133
# _unqualified_scheme: "http" or "https"
134
# _scheme: may have "+pycurl", etc
136
def __init__(self, base, _from_transport=None):
34
mutter("get_url %s" % url)
35
url_f = urllib2.urlopen(url)
38
class HttpTransportError(TransportError):
41
class HttpTransport(Transport):
42
"""This is the transport agent for http:// access.
44
TODO: Implement pipelined versions of all of the *_multi() functions.
47
def __init__(self, base):
137
48
"""Set the base path where files will be stored."""
138
proto_match = re.match(r'^(https?)(\+\w+)?://', base)
140
raise AssertionError("not a http url: %r" % base)
141
self._unqualified_scheme = proto_match.group(1)
142
impl_name = proto_match.group(2)
144
impl_name = impl_name[1:]
145
self._impl_name = impl_name
146
super(HttpTransportBase, self).__init__(base,
147
_from_transport=_from_transport)
148
# range hint is handled dynamically throughout the life
149
# of the transport object. We start by trying multi-range
150
# requests and if the server returns bogus results, we
151
# retry with single range requests and, finally, we
152
# forget about range if the server really can't
153
# understand. Once acquired, this piece of info is
154
# propagated to clones.
155
if _from_transport is not None:
156
self._range_hint = _from_transport._range_hint
158
self._range_hint = 'multi'
160
def _remote_path(self, relpath):
161
"""Produce absolute path, adjusting protocol."""
162
relative = urlutils.unescape(relpath).encode('utf-8')
163
path = self._combine_paths(self._path, relative)
164
return self._unsplit_url(self._unqualified_scheme,
165
self._user, self._password,
166
self._host, self._port,
49
assert base.startswith('http://') or base.startswith('https://')
50
super(HttpTransport, self).__init__(base)
51
# In the future we might actually connect to the remote host
52
# rather than using get_url
53
# self._connection = None
54
(self._proto, self._host,
55
self._path, self._parameters,
56
self._query, self._fragment) = urlparse.urlparse(self.base)
58
def should_cache(self):
59
"""Return True if the data pulled across should be cached locally.
63
def clone(self, offset=None):
64
"""Return a new HttpTransport with root at self.base + offset
65
For now HttpTransport does not actually connect, so just return
66
a new HttpTransport object.
69
return HttpTransport(self.base)
71
return HttpTransport(self.abspath(offset))
73
def abspath(self, relpath):
74
"""Return the full url to the given relative path.
75
This can be supplied with a string or a list
77
assert isinstance(relpath, basestring)
78
if isinstance(relpath, basestring):
79
relpath_parts = relpath.split('/')
81
# TODO: Don't call this with an array - no magic interfaces
82
relpath_parts = relpath[:]
83
if len(relpath_parts) > 1:
84
if relpath_parts[0] == '':
85
raise ValueError("path %r within branch %r seems to be absolute"
86
% (relpath, self._path))
87
if relpath_parts[-1] == '':
88
raise ValueError("path %r within branch %r seems to be a directory"
89
% (relpath, self._path))
90
basepath = self._path.split('/')
91
if len(basepath) > 0 and basepath[-1] == '':
92
basepath = basepath[:-1]
93
for p in relpath_parts:
95
if len(basepath) == 0:
96
# In most filesystems, a request for the parent
97
# of root, just returns root.
100
elif p == '.' or p == '':
104
# Possibly, we could use urlparse.urljoin() here, but
105
# I'm concerned about when it chooses to strip the last
106
# portion of the path, and when it doesn't.
107
path = '/'.join(basepath)
108
return urlparse.urlunparse((self._proto,
109
self._host, path, '', '', ''))
169
111
def has(self, relpath):
170
raise NotImplementedError("has() is abstract on %r" % self)
172
def get(self, relpath):
112
"""Does the target location exist?
114
TODO: HttpTransport.has() should use a HEAD request,
115
not a full GET request.
117
TODO: This should be changed so that we don't use
118
urllib2 and get an exception, the code path would be
119
cleaner if we just do an http HEAD request, and parse
123
f = get_url(self.abspath(relpath))
124
# Without the read and then close()
125
# we tend to have busy sockets.
129
except urllib2.URLError, e:
134
if e.errno == errno.ENOENT:
136
raise HttpTransportError(orig_error=e)
138
def get(self, relpath, decode=False):
173
139
"""Get the file at the given relative path.
175
141
:param relpath: The relative path to the file
177
code, response_file = self._get(relpath, None)
180
def _get(self, relpath, ranges, tail_amount=0):
181
"""Get a file, or part of a file.
183
:param relpath: Path relative to transport base URL
184
:param ranges: None to get the whole file;
185
or a list of _CoalescedOffset to fetch parts of a file.
186
:param tail_amount: The amount to get from the end of the file.
188
:returns: (http_code, result_file)
190
raise NotImplementedError(self._get)
192
def get_request(self):
193
return SmartClientHTTPMediumRequest(self)
195
def get_smart_medium(self):
196
"""See Transport.get_smart_medium.
198
HttpTransportBase directly implements the minimal interface of
199
SmartMediumClient, so this returns self.
203
def _degrade_range_hint(self, relpath, ranges, exc_info):
204
if self._range_hint == 'multi':
205
self._range_hint = 'single'
206
mutter('Retry "%s" with single range request' % relpath)
207
elif self._range_hint == 'single':
208
self._range_hint = None
209
mutter('Retry "%s" without ranges' % relpath)
211
# We tried all the tricks, but nothing worked. We re-raise original
212
# exception; the 'mutter' calls above will indicate that further
213
# tries were unsuccessful
214
raise exc_info[0], exc_info[1], exc_info[2]
216
def _get_ranges_hinted(self, relpath, ranges):
217
"""Issue a ranged GET request taking server capabilities into account.
219
Depending of the errors returned by the server, we try several GET
220
requests, trying to minimize the data transferred.
222
:param relpath: Path relative to transport base URL
223
:param ranges: None to get the whole file;
224
or a list of _CoalescedOffset to fetch parts of a file.
225
:returns: A file handle containing at least the requested ranges.
232
code, f = self._get(relpath, ranges)
233
except errors.InvalidRange, e:
235
exc_info = sys.exc_info()
236
self._degrade_range_hint(relpath, ranges, exc_info)
240
# _coalesce_offsets is a helper for readv; it tries to combine ranges without
241
# degrading readv performance. _bytes_to_read_before_seek is the value
242
# used for the limit parameter and has been tuned for other transports. For
243
# HTTP, the name is inappropriate but the parameter is still useful and
244
# helps reduce the number of chunks in the response. The overhead for a
245
# chunk (headers, length, footer around the data itself) is variable but
246
# around 50 bytes. We use 128 to reduce the range specifiers that appear in
247
# the header, some servers (notably Apache) enforce a maximum length for a
248
# header and issue a '400: Bad request' error when too many ranges are
250
_bytes_to_read_before_seek = 128
251
# No limit on the number of offsets that get combined into one; we are trying
252
# to avoid downloading the whole file.
253
# readv() passes ``self._max_readv_combine`` as the ``limit`` argument of
# _coalesce_offsets, so the setting must be defined under that exact name;
# the earlier spelling (``_max_readv_combined``) was never read by readv().
# 0 means "no limit" per the comment above this setting.
_max_readv_combine = 0
# Backward-compatible alias for any code still referring to the old spelling.
_max_readv_combined = _max_readv_combine
255
def readv(self, relpath, offsets):
256
"""Get parts of the file at the given relative path.
258
:param offsets: A list of (offset, size) tuples.
259
:param return: A list or generator of (offset, data) tuples
261
sorted_offsets = sorted(list(offsets))
262
fudge = self._bytes_to_read_before_seek
263
coalesced = self._coalesce_offsets(sorted_offsets,
264
limit=self._max_readv_combine,
266
coalesced = list(coalesced)
267
mutter('http readv of %s offsets => %s collapsed %s',
268
relpath, len(offsets), len(coalesced))
270
f = self._get_ranges_hinted(relpath, coalesced)
271
for start, size in offsets:
275
f.seek(start, ((start < 0) and 2) or 0)
279
if len(data) != size:
280
raise errors.ShortReadvError(relpath, start, size,
282
except errors.ShortReadvError, e:
283
self._degrade_range_hint(relpath, coalesced, sys.exc_info())
285
# Since the offsets and the ranges may not be in the same
286
# order, we don't try to calculate a restricted single
287
# range encompassing unprocessed offsets.
289
# Note: we replace 'f' here, it may need cleaning one day
290
# before being thrown that way.
291
f = self._get_ranges_hinted(relpath, coalesced)
294
# After one or more tries, we get the data.
298
@deprecated_method(zero_seventeen)
299
def offsets_to_ranges(offsets):
300
"""Turn a list of offsets and sizes into a list of byte ranges.
302
:param offsets: A list of tuples of (start, size). An empty list
304
:return: a list of inclusive byte ranges (start, end)
305
Adjacent ranges will be combined.
307
# Make sure we process sorted offsets
308
offsets = sorted(offsets)
313
for start, size in offsets:
314
end = start + size - 1
316
combined.append([start, end])
317
elif start <= prev_end + 1:
318
combined[-1][1] = end
320
combined.append([start, end])
325
def _post(self, body_bytes):
326
"""POST body_bytes to .bzr/smart on this transport.
328
:returns: (response code, response body file-like object).
330
# TODO: Requiring all the body_bytes to be available at the beginning of
331
# the POST may require large client buffers. It would be nice to have
332
# an interface that allows streaming via POST when possible (and
333
# degrades to a local buffer when not).
334
raise NotImplementedError(self._post)
336
def put_file(self, relpath, f, mode=None):
337
"""Copy the file-like object into the location.
144
return get_url(self.abspath(relpath))
145
except urllib2.HTTPError, e:
147
raise NoSuchFile(msg = "Error retrieving %s: %s"
148
% (self.abspath(relpath), str(e)),
151
except (BzrError, IOError), e:
152
raise ConnectionError(msg = "Error retrieving %s: %s"
153
% (self.abspath(relpath), str(e)),
156
def put(self, relpath, f):
157
"""Copy the file-like or string object into the location.
339
159
:param relpath: Location to put the contents, relative to base.
340
:param f: File-like object.
160
:param f: File-like or string object.
342
raise errors.TransportNotPossible('http PUT not supported')
162
raise TransportNotPossible('http PUT not supported')
344
def mkdir(self, relpath, mode=None):
164
def mkdir(self, relpath):
345
165
"""Create a directory at the given path."""
346
raise errors.TransportNotPossible('http does not support mkdir()')
348
def rmdir(self, relpath):
349
"""See Transport.rmdir."""
350
raise errors.TransportNotPossible('http does not support rmdir()')
352
def append_file(self, relpath, f, mode=None):
166
raise TransportNotPossible('http does not support mkdir()')
168
def append(self, relpath, f):
353
169
"""Append the text in the file-like object into the final
356
raise errors.TransportNotPossible('http does not support append()')
172
raise TransportNotPossible('http does not support append()')
358
174
def copy(self, rel_from, rel_to):
359
175
"""Copy the item at rel_from to the location at rel_to"""
360
raise errors.TransportNotPossible('http does not support copy()')
176
raise TransportNotPossible('http does not support copy()')
362
def copy_to(self, relpaths, other, mode=None, pb=None):
178
def copy_to(self, relpaths, other, pb=None):
363
179
"""Copy a set of entries from self into another Transport.
365
181
:param relpaths: A list/generator of entries to be copied.