101
101
first_path_slash = path.find('/')
102
102
if first_path_slash == -1:
103
103
return len(scheme), None
104
return len(scheme), first_path_slash+len(scheme)+3
104
return len(scheme), first_path_slash+m.start('path')
108
"""Tests whether a URL is in actual fact a URL."""
109
return _url_scheme_re.match(url) is not None
107
112
def join(base, *args):
121
match = _url_scheme_re.match(base)
124
scheme = match.group('scheme')
125
path = match.group('path').split('/')
126
if path[-1:] == ['']:
127
# Strip off a trailing slash
128
# This helps both when we are at the root, and when
129
# 'base' has an extra slash at the end
132
path = base.split('/')
134
if scheme is not None and len(path) >= 1:
136
# the path should be represented as an abs path.
137
# we know this must be absolute because of the presence of a URL scheme.
139
path = [''] + path[1:]
141
# create an empty host, but dont alter the path - this might be a
142
# relative url fragment.
126
scheme_end, path_start = _find_scheme_and_separator(base)
127
if scheme_end is None and path_start is None:
129
elif path_start is None:
130
path_start = len(base)
131
path = base[path_start:]
147
match = _url_scheme_re.match(arg)
150
scheme = match.group('scheme')
151
# this skips .. normalisation, making http://host/../../..
153
path = match.group('path').split('/')
154
# set the host and path according to new absolute URL, discarding
155
# any previous values.
156
# XXX: duplicates mess from earlier in this function. This URL
157
# manipulation code needs some cleaning up.
158
if scheme is not None and len(path) >= 1:
161
# url scheme implies absolute path.
164
# no url scheme we take the path as is.
133
arg_scheme_end, arg_path_start = _find_scheme_and_separator(arg)
134
if arg_scheme_end is None and arg_path_start is None:
136
elif arg_path_start is None:
137
arg_path_start = len(arg)
138
if arg_scheme_end is not None:
140
path = arg[arg_path_start:]
141
scheme_end = arg_scheme_end
142
path_start = arg_path_start
167
path = '/'.join(path)
168
144
path = joinpath(path, arg)
169
path = path.split('/')
170
if remove_root and path[0:1] == ['']:
173
# Remove the leading slash from the path, so long as it isn't also the
174
# trailing slash, which we want to keep if present.
175
if path and path[0] == '' and len(path) > 1:
180
return '/'.join(path)
181
return scheme + '://' + '/'.join(path)
145
return base[:path_start] + path
184
148
def joinpath(base, *args):
278
242
# on non-win32 platform
279
243
# FIXME: It turns out that on nt, ntpath.abspath uses nt._getfullpathname
280
244
# which actually strips trailing space characters.
281
# The worst part is that under linux ntpath.abspath has different
245
# The worst part is that on linux ntpath.abspath has different
282
246
# semantics, since 'nt' is not an available module.
284
248
return 'file:///'
303
267
MIN_ABS_FILEURL_LENGTH = WIN32_MIN_ABS_FILEURL_LENGTH
306
_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,})://(?P<path>.*)$')
270
_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,}):(//)?(?P<path>.*)$')
307
271
_url_hex_escapes_re = re.compile(r'(%[0-9a-fA-F]{2})')
339
303
:param url: Either a hybrid URL or a local path
340
304
:return: A normalized URL which only includes 7-bit ASCII characters.
342
m = _url_scheme_re.match(url)
306
scheme_end, path_start = _find_scheme_and_separator(url)
307
if scheme_end is None:
344
308
return local_path_to_url(url)
345
scheme = m.group('scheme')
346
path = m.group('path')
309
prefix = url[:path_start]
310
path = url[path_start:]
347
311
if not isinstance(url, unicode):
349
313
if c not in _url_safe_characters:
350
314
raise errors.InvalidURL(url, 'URLs can only contain specific'
351
315
' safe characters (not %r)' % c)
352
316
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
353
return str(scheme + '://' + ''.join(path))
317
return str(prefix + ''.join(path))
355
319
# We have a unicode (hybrid) url
356
320
path_chars = list(path)
362
326
['%%%02X' % ord(c) for c in path_chars[i].encode('utf-8')])
363
327
path = ''.join(path_chars)
364
328
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
365
return str(scheme + '://' + path)
329
return str(prefix + path)
368
332
def relative_url(base, other):