19
19
"""A collection of functions for handling URL operations."""
22
from posixpath import split as _posix_split, normpath as _posix_normpath
25
from bzrlib.lazy_import import lazy_import
26
lazy_import(globals(), """
27
from posixpath import split as _posix_split, normpath as _posix_normpath
27
import bzrlib.errors as errors
31
38
def basename(url, exclude_trailing_slash=True):
70
77
This assumes that both paths are already fully specified file:// URLs.
72
assert len(base) >= MIN_ABS_FILEURL_LENGTH, ('Length of base must be equal or'
73
' exceed the platform minimum url length (which is %d)' %
74
MIN_ABS_FILEURL_LENGTH)
79
if len(base) < MIN_ABS_FILEURL_LENGTH:
80
raise ValueError('Length of base must be equal or'
81
' exceed the platform minimum url length (which is %d)' %
82
MIN_ABS_FILEURL_LENGTH)
76
83
base = local_path_from_url(base)
77
84
path = local_path_from_url(path)
78
return escape(bzrlib.osutils.relpath(base, path))
85
return escape(osutils.relpath(base, path))
81
88
def _find_scheme_and_separator(url):
111
118
join('http://foo', 'bar') => 'http://foo/bar'
112
119
join('http://foo', 'bar', '../baz') => 'http://foo/baz'
114
m = _url_scheme_re.match(base)
123
match = _url_scheme_re.match(base)
117
scheme = m.group('scheme')
118
path = m.group('path').split('/')
126
scheme = match.group('scheme')
127
path = match.group('path').split('/')
119
128
if path[-1:] == ['']:
120
129
# Strip off a trailing slash
121
130
# This helps both when we are at the root, and when
125
134
path = base.split('/')
136
if scheme is not None and len(path) >= 1:
138
# the path should be represented as an abs path.
139
# we know this must be absolute because of the presence of a URL scheme.
141
path = [''] + path[1:]
143
# create an empty host, but don't alter the path - this might be a
144
# relative url fragment.
128
m = _url_scheme_re.match(arg)
149
match = _url_scheme_re.match(arg)
131
scheme = m.group('scheme')
132
path = m.group('path').split('/')
152
scheme = match.group('scheme')
153
# this skips .. normalisation, making http://host/../../..
155
path = match.group('path').split('/')
156
# set the host and path according to new absolute URL, discarding
157
# any previous values.
158
# XXX: duplicates mess from earlier in this function. This URL
159
# manipulation code needs some cleaning up.
160
if scheme is not None and len(path) >= 1:
163
# url scheme implies absolute path.
166
# no url scheme we take the path as is.
134
for chunk in arg.split('/'):
139
# Don't pop off the host portion
142
raise errors.InvalidURLJoin('Cannot go above root',
169
path = '/'.join(path)
170
path = joinpath(path, arg)
171
path = path.split('/')
172
if remove_root and path[0:1] == ['']:
175
# Remove the leading slash from the path, so long as it isn't also the
176
# trailing slash, which we want to keep if present.
177
if path and path[0] == '' and len(path) > 1:
147
181
if scheme is None:
148
182
return '/'.join(path)
149
183
return scheme + '://' + '/'.join(path)
186
def joinpath(base, *args):
187
"""Join URL path segments to a URL path segment.
189
This is somewhat like osutils.joinpath, but intended for URLs.
191
XXX: this duplicates some normalisation logic, and also duplicates a lot of
192
path handling logic that already exists in some Transport implementations.
193
We really should try to have exactly one place in the code base responsible
194
for combining paths of URLs.
196
path = base.split('/')
197
if len(path) > 1 and path[-1] == '':
198
# If the path ends in a trailing /, remove it.
201
if arg.startswith('/'):
203
for chunk in arg.split('/'):
208
raise errors.InvalidURLJoin('Cannot go above root',
216
return '/'.join(path)
152
219
# jam 20060502 Sorted to 'l' because the final target is 'local_path_from_url'
153
220
def _posix_local_path_from_url(url):
154
221
"""Convert a url like file:///path/to/foo into /path/to/foo"""
166
233
# importing directly from posixpath allows us to test this
167
234
# on non-posix platforms
168
235
return 'file://' + escape(_posix_normpath(
169
bzrlib.osutils._posix_abspath(path)))
236
osutils._posix_abspath(path)))
172
239
def _win32_local_path_from_url(url):
173
240
"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
174
if not url.startswith('file:///'):
175
raise errors.InvalidURL(url, 'local urls must start with file:///')
241
if not url.startswith('file://'):
242
raise errors.InvalidURL(url, 'local urls must start with file:///, '
243
'UNC path urls must start with file://')
176
244
# We strip off all 3 slashes
177
win32_url = url[len('file:///'):]
178
if (win32_url[0] not in ('abcdefghijklmnopqrstuvwxyz'
245
win32_url = url[len('file:'):]
246
# check for UNC path: //HOST/path
247
if not win32_url.startswith('///'):
248
if (win32_url[2] == '/'
249
or win32_url[3] in '|:'):
250
raise errors.InvalidURL(url, 'Win32 UNC path urls'
251
' have form file://HOST/path')
252
return unescape(win32_url)
253
# usual local path with drive letter
254
if (win32_url[3] not in ('abcdefghijklmnopqrstuvwxyz'
179
255
'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
180
or win32_url[1] not in '|:'
181
or win32_url[2] != '/'):
256
or win32_url[4] not in '|:'
257
or win32_url[5] != '/'):
182
258
raise errors.InvalidURL(url, 'Win32 file urls start with'
183
259
' file:///x:/, where x is a valid drive letter')
184
return win32_url[0].upper() + u':' + unescape(win32_url[2:])
260
return win32_url[3].upper() + u':' + unescape(win32_url[5:])
187
263
def _win32_local_path_to_url(path):
195
271
# which actually strips trailing space characters.
196
272
# The worst part is that under linux ntpath.abspath has different
197
273
# semantics, since 'nt' is not an available module.
198
win32_path = bzrlib.osutils._nt_normpath(
199
bzrlib.osutils._win32_abspath(path)).replace('\\', '/')
200
return 'file:///' + win32_path[0].upper() + ':' + escape(win32_path[2:])
274
win32_path = osutils._win32_abspath(path)
275
# check for UNC path \\HOST\path
276
if win32_path.startswith('//'):
277
return 'file:' + escape(win32_path)
278
return ('file:///' + str(win32_path[0].upper()) + ':' +
279
escape(win32_path[2:]))
203
282
local_path_to_url = _posix_local_path_to_url
215
294
_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,})://(?P<path>.*)$')
295
_url_hex_escapes_re = re.compile(r'(%[0-9a-fA-F]{2})')
298
def _unescape_safe_chars(matchobj):
299
"""re.sub callback to convert hex-escapes to plain characters (if safe).
301
e.g. '%7E' will be converted to '~'.
303
hex_digits = matchobj.group(0)[1:]
304
char = chr(int(hex_digits, 16))
305
if char in _url_dont_escape_characters:
308
return matchobj.group(0).upper()
218
311
def normalize_url(url):
219
312
"""Make sure that a path string is in fully normalized URL form.
221
This handles URLs which have unicode characters, spaces,
314
This handles URLs which have unicode characters, spaces,
222
315
special characters, etc.
224
317
It has two basic modes of operation, depending on whether the
237
330
m = _url_scheme_re.match(url)
239
332
return local_path_to_url(url)
333
scheme = m.group('scheme')
334
path = m.group('path')
240
335
if not isinstance(url, unicode):
242
337
if c not in _url_safe_characters:
243
338
raise errors.InvalidURL(url, 'URLs can only contain specific'
244
339
' safe characters (not %r)' % c)
340
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
341
return str(scheme + '://' + ''.join(path))
246
343
# We have a unicode (hybrid) url
247
scheme = m.group('scheme')
248
path = list(m.group('path'))
344
path_chars = list(path)
250
for i in xrange(len(path)):
251
if path[i] not in _url_safe_characters:
252
chars = path[i].encode('utf-8')
253
path[i] = ''.join(['%%%02X' % ord(c) for c in path[i].encode('utf-8')])
254
return scheme + '://' + ''.join(path)
346
for i in xrange(len(path_chars)):
347
if path_chars[i] not in _url_safe_characters:
348
chars = path_chars[i].encode('utf-8')
349
path_chars[i] = ''.join(
350
['%%%02X' % ord(c) for c in path_chars[i].encode('utf-8')])
351
path = ''.join(path_chars)
352
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
353
return str(scheme + '://' + path)
257
356
def relative_url(base, other):
273
372
other_scheme = other[:other_first_slash]
274
373
if base_scheme != other_scheme:
375
elif sys.platform == 'win32' and base_scheme == 'file://':
376
base_drive = base[base_first_slash+1:base_first_slash+3]
377
other_drive = other[other_first_slash+1:other_first_slash+3]
378
if base_drive != other_drive:
277
381
base_path = base[base_first_slash+1:]
278
382
other_path = other[other_first_slash+1:]
437
541
# These entries get mapped to themselves
438
542
_hex_display_map.update((hex,'%'+hex) for hex in _no_decode_hex)
544
# These characters shouldn't be percent-encoded, and it's always safe to
545
# unencode them if they are.
546
_url_dont_escape_characters = set(
547
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
548
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
549
"0123456789" # Numbers
550
"-._~" # Unreserved characters
440
553
# These characters should not be escaped
441
_url_safe_characters = set('abcdefghijklmnopqrstuvwxyz'
442
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
554
_url_safe_characters = set(
555
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
556
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
557
"0123456789" # Numbers
558
"_.-!~*'()" # Unreserved characters
559
"/;?:@&=+$," # Reserved characters
560
"%#" # Extra reserved characters
447
563
def unescape_for_display(url, encoding):
448
564
"""Decode what you can for a URL, so that we get a nice looking path.
499
616
# Otherwise take the url decoded one
501
618
return u'/'.join(res)
621
def derive_to_location(from_location):
622
"""Derive a TO_LOCATION given a FROM_LOCATION.
624
The normal case is a FROM_LOCATION of http://foo/bar => bar.
625
The Right Thing for some logical destinations may differ though
626
because no / may be present at all. In that case, the result is
627
the full name without the scheme indicator, e.g. lp:foo-bar => foo-bar.
628
This latter case also applies when a Windows drive
629
is used without a path, e.g. c:foo-bar => foo-bar.
630
If no /, path separator or : is found, the from_location is returned.
632
if from_location.find("/") >= 0 or from_location.find(os.sep) >= 0:
633
return os.path.basename(from_location.rstrip("/\\"))
635
sep = from_location.find(":")
637
return from_location[sep+1:]
642
def _is_absolute(url):
    """Report whether joining url onto a dummy base leaves url unchanged.

    url is joined onto the placeholder base '/foo' with osutils.pathjoin
    and the result is compared with url itself.

    NOTE(review): presumably osutils.pathjoin discards the base when the
    second component is already absolute, which is what would make this an
    "is absolute" test -- confirm against osutils.pathjoin.

    :param url: The path or URL fragment to examine.
    :return: True if the joined result equals url, False otherwise.
    """
    joined = osutils.pathjoin('/foo', url)
    return joined == url
646
def rebase_url(url, old_base, new_base):
647
"""Convert a relative path from an old base URL to a new base URL.
649
The result will be a relative path.
650
Absolute paths and full URLs are returned unaltered.
652
scheme, separator = _find_scheme_and_separator(url)
653
if scheme is not None:
655
if _is_absolute(url):
657
old_parsed = urlparse.urlparse(old_base)
658
new_parsed = urlparse.urlparse(new_base)
659
if (old_parsed[:2]) != (new_parsed[:2]):
660
raise errors.InvalidRebaseURLs(old_base, new_base)
661
return determine_relative_path(new_parsed[2],
662
osutils.pathjoin(old_parsed[2], url))
665
def determine_relative_path(from_path, to_path):
666
"""Determine a relative path from from_path to to_path."""
667
from_segments = osutils.splitpath(from_path)
668
to_segments = osutils.splitpath(to_path)
670
for count, (from_element, to_element) in enumerate(zip(from_segments,
672
if from_element != to_element:
676
unique_from = from_segments[count:]
677
unique_to = to_segments[count:]
678
segments = (['..'] * len(unique_from) + unique_to)
679
if len(segments) == 0:
681
return osutils.pathjoin(*segments)