117
117
join('http://foo', 'bar') => 'http://foo/bar'
118
118
join('http://foo', 'bar', '../baz') => 'http://foo/baz'
122
match = _url_scheme_re.match(base)
120
m = _url_scheme_re.match(base)
125
scheme = match.group('scheme')
126
path = match.group('path').split('/')
123
scheme = m.group('scheme')
124
path = m.group('path').split('/')
127
125
if path[-1:] == ['']:
128
126
# Strip off a trailing slash
129
127
# This helps both when we are at the root, and when
133
131
path = base.split('/')
135
if scheme is not None and len(path) >= 1:
137
# the path should be represented as an abs path.
138
# we know this must be absolute because of the presence of a URL scheme.
140
path = [''] + path[1:]
142
# create an empty host, but dont alter the path - this might be a
143
# relative url fragment.
148
match = _url_scheme_re.match(arg)
134
m = _url_scheme_re.match(arg)
151
scheme = match.group('scheme')
137
scheme = m.group('scheme')
152
138
# this skips .. normalisation, making http://host/../../..
153
139
# be rather strange.
154
path = match.group('path').split('/')
155
# set the host and path according to new absolute URL, discarding
156
# any previous values.
157
# XXX: duplicates mess from earlier in this function. This URL
158
# manipulation code needs some cleaning up.
159
if scheme is not None and len(path) >= 1:
162
# url scheme implies absolute path.
165
# no url scheme we take the path as is.
140
path = m.group('path').split('/')
168
path = '/'.join(path)
169
path = joinpath(path, arg)
170
path = path.split('/')
171
if remove_root and path[0:1] == ['']:
174
# Remove the leading slash from the path, so long as it isn't also the
175
# trailing slash, which we want to keep if present.
176
if path and path[0] == '' and len(path) > 1:
142
for chunk in arg.split('/'):
147
# Don't pop off the host portion
150
raise errors.InvalidURLJoin('Cannot go above root',
180
155
if scheme is None:
181
156
return '/'.join(path)
182
157
return scheme + '://' + '/'.join(path)
185
def joinpath(base, *args):
186
"""Join URL path segments to a URL path segment.
188
This is somewhat like osutils.joinpath, but intended for URLs.
190
XXX: this duplicates some normalisation logic, and also duplicates a lot of
191
path handling logic that already exists in some Transport implementations.
192
We really should try to have exactly one place in the code base responsible
193
for combining paths of URLs.
195
path = base.split('/')
196
if len(path) > 1 and path[-1] == '':
197
#If the path ends in a trailing /, remove it.
200
if arg.startswith('/'):
202
for chunk in arg.split('/'):
207
raise errors.InvalidURLJoin('Cannot go above root',
215
return '/'.join(path)
218
160
# jam 20060502 Sorted to 'l' because the final target is 'local_path_from_url'
219
161
def _posix_local_path_from_url(url):
220
162
"""Convert a url like file:///path/to/foo into /path/to/foo"""
292
234
_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,})://(?P<path>.*)$')
293
_url_hex_escapes_re = re.compile(r'(%[0-9a-fA-F]{2})')
296
def _unescape_safe_chars(matchobj):
297
"""re.sub callback to convert hex-escapes to plain characters (if safe).
299
e.g. '%7E' will be converted to '~'.
301
hex_digits = matchobj.group(0)[1:]
302
char = chr(int(hex_digits, 16))
303
if char in _url_dont_escape_characters:
306
return matchobj.group(0).upper()
309
237
def normalize_url(url):
310
238
"""Make sure that a path string is in fully normalized URL form.
312
This handles URLs which have unicode characters, spaces,
240
This handles URLs which have unicode characters, spaces,
313
241
special characters, etc.
315
243
It has two basic modes of operation, depending on whether the
328
256
m = _url_scheme_re.match(url)
330
258
return local_path_to_url(url)
331
scheme = m.group('scheme')
332
path = m.group('path')
333
259
if not isinstance(url, unicode):
335
261
if c not in _url_safe_characters:
336
262
raise errors.InvalidURL(url, 'URLs can only contain specific'
337
263
' safe characters (not %r)' % c)
338
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
339
return str(scheme + '://' + ''.join(path))
341
265
# We have a unicode (hybrid) url
342
path_chars = list(path)
266
scheme = m.group('scheme')
267
path = list(m.group('path'))
344
for i in xrange(len(path_chars)):
345
if path_chars[i] not in _url_safe_characters:
346
chars = path_chars[i].encode('utf-8')
347
path_chars[i] = ''.join(
348
['%%%02X' % ord(c) for c in path_chars[i].encode('utf-8')])
349
path = ''.join(path_chars)
350
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
351
return str(scheme + '://' + path)
269
for i in xrange(len(path)):
270
if path[i] not in _url_safe_characters:
271
chars = path[i].encode('utf-8')
272
path[i] = ''.join(['%%%02X' % ord(c) for c in path[i].encode('utf-8')])
273
return str(scheme + '://' + ''.join(path))
354
276
def relative_url(base, other):
534
456
#These entries get mapped to themselves
535
457
_hex_display_map.update((hex,'%'+hex) for hex in _no_decode_hex)
537
# These characters shouldn't be percent-encoded, and it's always safe to
538
# unencode them if they are.
539
_url_dont_escape_characters = set(
540
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
541
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
542
"0123456789" # Numbers
543
"-._~" # Unreserved characters
546
459
# These characters should not be escaped
547
460
_url_safe_characters = set(
548
461
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
608
521
# Otherwise take the url decoded one
610
523
return u'/'.join(res)
613
def derive_to_location(from_location):
614
"""Derive a TO_LOCATION given a FROM_LOCATION.
616
The normal case is a FROM_LOCATION of http://foo/bar => bar.
617
The Right Thing for some logical destinations may differ though
618
because no / may be present at all. In that case, the result is
619
the full name without the scheme indicator, e.g. lp:foo-bar => foo-bar.
620
This latter case also applies when a Windows drive
621
is used without a path, e.g. c:foo-bar => foo-bar.
622
If no /, path separator or : is found, the from_location is returned.
624
if from_location.find("/") >= 0 or from_location.find(os.sep) >= 0:
625
return os.path.basename(from_location.rstrip("/\\"))
627
sep = from_location.find(":")
629
return from_location[sep+1:]