1
# Copyright (C) 2006, 2008 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""A collection of functions for handling URL operations."""
23
from bzrlib.lazy_import import lazy_import
24
lazy_import(globals(), """
25
from posixpath import split as _posix_split, normpath as _posix_normpath
36
def basename(url, exclude_trailing_slash=True):
    """Return the final path component of a URL.

    :param url: The URL to inspect
    :param exclude_trailing_slash: When true, a URL shaped like
        "path/to/foo/" yields 'foo' instead of ''
    :return: Only the last component of the URL. The result can be ''
        when exclude_trailing_slash is false, or when the URL is at
        its root.
    """
    _parent, last_component = split(
        url, exclude_trailing_slash=exclude_trailing_slash)
    return last_component
49
def dirname(url, exclude_trailing_slash=True):
    """Return the parent directory of the given URL.

    :param url: Relative or absolute URL
    :param exclude_trailing_slash: Drop a final slash before splitting
        (http://host/foo/ is treated as http://host/foo, while
        http://host/ is left as http://host/)
    :return: The URL with its last path chunk removed
    """
    # TODO: jam 20060502 This was named dirname to be consistent
    #       with the os functions, but maybe "parent" would be better
    parent, _last_component = split(
        url, exclude_trailing_slash=exclude_trailing_slash)
    return parent
64
"""Escape relpath to be a valid url."""
65
if isinstance(relpath, unicode):
66
relpath = relpath.encode('utf-8')
67
# After quoting and encoding, the path should be perfectly
68
# safe as a plain ASCII string, str() just enforces this
69
return str(urllib.quote(relpath, safe='/~'))
72
def file_relpath(base, path):
    """Return the escaped portion of ``path`` that lies below ``base``.

    Both arguments are expected to be fully specified file:// URLs.

    :param base: The file:// URL to measure from
    :param path: The file:// URL to express relative to ``base``
    :return: The relative path, escaped for use in a URL
    :raises ValueError: if ``base`` is shorter than the platform's minimum
        absolute file URL length
    """
    if len(base) < MIN_ABS_FILEURL_LENGTH:
        message = ('Length of base (%r) must equal or'
                   ' exceed the platform minimum url length (which is %d)')
        raise ValueError(message % (base, MIN_ABS_FILEURL_LENGTH))
    # Do the comparison on local filesystem paths, then escape the result
    # so that it is usable as URL text again.
    local_base = local_path_from_url(base)
    local_path = local_path_from_url(path)
    return escape(osutils.relpath(local_base, local_path))
86
def _find_scheme_and_separator(url):
87
"""Find the scheme separator (://) and the first path separator
89
This is just a helper function for other path utilities.
90
It could probably be replaced by urlparse
92
m = _url_scheme_re.match(url)
96
scheme = m.group('scheme')
97
path = m.group('path')
99
# Find the path separating slash
100
# (first slash after the ://)
101
first_path_slash = path.find('/')
102
if first_path_slash == -1:
103
return len(scheme), None
104
return len(scheme), first_path_slash+len(scheme)+3
107
def join(base, *args):
108
"""Create a URL by joining sections.
110
This will normalize '..', assuming that paths are absolute
111
(it assumes no symlinks in either path)
113
If any of *args is an absolute URL, it will be treated correctly.
115
join('http://foo', 'http://bar') => 'http://bar'
116
join('http://foo', 'bar') => 'http://foo/bar'
117
join('http://foo', 'bar', '../baz') => 'http://foo/baz'
121
match = _url_scheme_re.match(base)
124
scheme = match.group('scheme')
125
path = match.group('path').split('/')
126
if path[-1:] == ['']:
127
# Strip off a trailing slash
128
# This helps both when we are at the root, and when
129
# 'base' has an extra slash at the end
132
path = base.split('/')
134
if scheme is not None and len(path) >= 1:
136
# the path should be represented as an abs path.
137
# we know this must be absolute because of the presence of a URL scheme.
139
path = [''] + path[1:]
141
# create an empty host, but dont alter the path - this might be a
142
# relative url fragment.
147
match = _url_scheme_re.match(arg)
150
scheme = match.group('scheme')
151
# this skips .. normalisation, making http://host/../../..
153
path = match.group('path').split('/')
154
# set the host and path according to new absolute URL, discarding
155
# any previous values.
156
# XXX: duplicates mess from earlier in this function. This URL
157
# manipulation code needs some cleaning up.
158
if scheme is not None and len(path) >= 1:
161
# url scheme implies absolute path.
164
# no url scheme we take the path as is.
167
path = '/'.join(path)
168
path = joinpath(path, arg)
169
path = path.split('/')
170
if remove_root and path[0:1] == ['']:
173
# Remove the leading slash from the path, so long as it isn't also the
174
# trailing slash, which we want to keep if present.
175
if path and path[0] == '' and len(path) > 1:
180
return '/'.join(path)
181
return scheme + '://' + '/'.join(path)
184
def joinpath(base, *args):
185
"""Join URL path segments to a URL path segment.
187
This is somewhat like osutils.joinpath, but intended for URLs.
189
XXX: this duplicates some normalisation logic, and also duplicates a lot of
190
path handling logic that already exists in some Transport implementations.
191
We really should try to have exactly one place in the code base responsible
192
for combining paths of URLs.
194
path = base.split('/')
195
if len(path) > 1 and path[-1] == '':
196
#If the path ends in a trailing /, remove it.
199
if arg.startswith('/'):
201
for chunk in arg.split('/'):
206
raise errors.InvalidURLJoin('Cannot go above root',
214
return '/'.join(path)
217
# jam 20060502 Sorted to 'l' because the final target is 'local_path_from_url'
218
def _posix_local_path_from_url(url):
    """Convert a url like file:///path/to/foo into /path/to/foo

    :param url: A URL that starts with file:///
    :return: The unescaped local path
    :raises errors.InvalidURL: if url does not start with file:///
    """
    scheme_prefix = 'file://'
    if url.startswith(scheme_prefix + '/'):
        # Only "file://" is removed; the third slash is kept because it is
        # the leading slash of the local path.
        return unescape(url[len(scheme_prefix):])
    raise errors.InvalidURL(url, 'local urls must start with file:///')
226
def _posix_local_path_to_url(path):
    """Convert a local path like ./foo into a URL like file:///path/to/foo

    The path is made absolute and normalized, then escaped, so unicode and
    other special characters end up percent-encoded.

    :param path: A local filesystem path, relative or absolute
    :return: The corresponding file:// URL
    """
    # posixpath's normpath is used (rather than os.path) so that this can
    # be tested on non-posix platforms
    absolute = osutils._posix_abspath(path)
    normalized = _posix_normpath(absolute)
    return 'file://' + escape(normalized)
237
def _win32_local_path_from_url(url):
238
"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
239
if not url.startswith('file://'):
240
raise errors.InvalidURL(url, 'local urls must start with file:///, '
241
'UNC path urls must start with file://')
242
# We strip off all 3 slashes
243
win32_url = url[len('file:'):]
244
# check for UNC path: //HOST/path
245
if not win32_url.startswith('///'):
246
if (win32_url[2] == '/'
247
or win32_url[3] in '|:'):
248
raise errors.InvalidURL(url, 'Win32 UNC path urls'
249
' have form file://HOST/path')
250
return unescape(win32_url)
252
# allow empty paths so we can serve all roots
253
if win32_url == '///':
256
# usual local path with drive letter
257
if (win32_url[3] not in ('abcdefghijklmnopqrstuvwxyz'
258
'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
259
or win32_url[4] not in '|:'
260
or win32_url[5] != '/'):
261
raise errors.InvalidURL(url, 'Win32 file urls start with'
262
' file:///x:/, where x is a valid drive letter')
263
return win32_url[3].upper() + u':' + unescape(win32_url[5:])
266
def _win32_local_path_to_url(path):
267
"""Convert a local path like ./foo into a URL like file:///C:/path/to/foo
269
This also handles transforming escaping unicode characters, etc.
271
# importing directly from ntpath allows us to test this
272
# on non-win32 platform
273
# FIXME: It turns out that on nt, ntpath.abspath uses nt._getfullpathname
274
# which actually strips trailing space characters.
275
# The worst part is that under linux ntpath.abspath has different
276
# semantics, since 'nt' is not an available module.
280
win32_path = osutils._win32_abspath(path)
281
# check for UNC path \\HOST\path
282
if win32_path.startswith('//'):
283
return 'file:' + escape(win32_path)
284
return ('file:///' + str(win32_path[0].upper()) + ':' +
285
escape(win32_path[2:]))
288
# Default to the POSIX conversions; the win32 variants are swapped in below
# when sys.platform reports win32.
local_path_to_url = _posix_local_path_to_url
local_path_from_url = _posix_local_path_from_url
# Minimum length of an absolute file URL on the current platform.
MIN_ABS_FILEURL_LENGTH = len('file:///')
# The win32 minimum is longer because it also includes a drive letter.
WIN32_MIN_ABS_FILEURL_LENGTH = len('file:///C:/')

if sys.platform == 'win32':
    local_path_to_url = _win32_local_path_to_url
    local_path_from_url = _win32_local_path_from_url

    MIN_ABS_FILEURL_LENGTH = WIN32_MIN_ABS_FILEURL_LENGTH
300
_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,})://(?P<path>.*)$')
301
_url_hex_escapes_re = re.compile(r'(%[0-9a-fA-F]{2})')
304
def _unescape_safe_chars(matchobj):
305
"""re.sub callback to convert hex-escapes to plain characters (if safe).
307
e.g. '%7E' will be converted to '~'.
309
hex_digits = matchobj.group(0)[1:]
310
char = chr(int(hex_digits, 16))
311
if char in _url_dont_escape_characters:
314
return matchobj.group(0).upper()
317
def normalize_url(url):
318
"""Make sure that a path string is in fully normalized URL form.
320
This handles URLs which have unicode characters, spaces,
321
special characters, etc.
323
It has two basic modes of operation, depending on whether the
324
supplied string starts with a url specifier (scheme://) or not.
325
If it does not have a specifier it is considered a local path,
326
and will be converted into a file:/// url. Non-ascii characters
327
will be encoded using utf-8.
328
If it does have a url specifier, it will be treated as a "hybrid"
329
URL. Basically, a URL that should have URL special characters already
330
escaped (like +?&# etc), but may have unicode characters, etc
331
which would not be valid in a real URL.
333
:param url: Either a hybrid URL or a local path
334
:return: A normalized URL which only includes 7-bit ASCII characters.
336
m = _url_scheme_re.match(url)
338
return local_path_to_url(url)
339
scheme = m.group('scheme')
340
path = m.group('path')
341
if not isinstance(url, unicode):
343
if c not in _url_safe_characters:
344
raise errors.InvalidURL(url, 'URLs can only contain specific'
345
' safe characters (not %r)' % c)
346
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
347
return str(scheme + '://' + ''.join(path))
349
# We have a unicode (hybrid) url
350
path_chars = list(path)
352
for i in xrange(len(path_chars)):
353
if path_chars[i] not in _url_safe_characters:
354
chars = path_chars[i].encode('utf-8')
355
path_chars[i] = ''.join(
356
['%%%02X' % ord(c) for c in path_chars[i].encode('utf-8')])
357
path = ''.join(path_chars)
358
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
359
return str(scheme + '://' + path)
362
def relative_url(base, other):
363
"""Return a path to other from base.
365
If other is unrelated to base, return other. Else return a relative path.
366
This assumes no symlinks as part of the url.
368
dummy, base_first_slash = _find_scheme_and_separator(base)
369
if base_first_slash is None:
372
dummy, other_first_slash = _find_scheme_and_separator(other)
373
if other_first_slash is None:
376
# this takes care of differing schemes or hosts
377
base_scheme = base[:base_first_slash]
378
other_scheme = other[:other_first_slash]
379
if base_scheme != other_scheme:
381
elif sys.platform == 'win32' and base_scheme == 'file://':
382
base_drive = base[base_first_slash+1:base_first_slash+3]
383
other_drive = other[other_first_slash+1:other_first_slash+3]
384
if base_drive != other_drive:
387
base_path = base[base_first_slash+1:]
388
other_path = other[other_first_slash+1:]
390
if base_path.endswith('/'):
391
base_path = base_path[:-1]
393
base_sections = base_path.split('/')
394
other_sections = other_path.split('/')
396
if base_sections == ['']:
398
if other_sections == ['']:
402
for b, o in zip(base_sections, other_sections):
405
output_sections.append(b)
407
match_len = len(output_sections)
408
output_sections = ['..' for x in base_sections[match_len:]]
409
output_sections.extend(other_sections[match_len:])
411
return "/".join(output_sections) or "."
414
def _win32_extract_drive_letter(url_base, path):
    """On win32 the drive letter needs to be added to the url base.

    :param url_base: The URL up to the first path slash, e.g. file://
    :param path: The rest of the URL, expected to look like /C:/foo
    :return: (url_base, path) with the leading '/C:' moved from the path
        onto the base, e.g. ('file:///C:', '/foo')
    :raises errors.InvalidURL: if path is too short, or does not have ':'
        or '|' at index 2 followed by '/' at index 3
    """
    # Strip off the drive letter
    # path is currently /C:/foo
    # At least four characters are needed because path[3] is examined; a
    # shorter path such as '/C:' is rejected as invalid rather than being
    # allowed to fail with an IndexError.
    if len(path) < 4 or path[2] not in ':|' or path[3] != '/':
        raise errors.InvalidURL(url_base + path,
            'win32 file:/// paths need a drive letter')
    url_base += path[0:3] # file:// + /C:
    path = path[3:] # /foo
    return url_base, path
426
def split(url, exclude_trailing_slash=True):
427
"""Split a URL into its parent directory and a child directory.
429
:param url: A relative or absolute URL
430
:param exclude_trailing_slash: Strip off a final '/' if it is part
431
of the path (but not if it is part of the protocol specification)
433
:return: (parent_url, child_dir). child_dir may be the empty string if we're at
436
scheme_loc, first_path_slash = _find_scheme_and_separator(url)
438
if first_path_slash is None:
439
# We have either a relative path, or no separating slash
440
if scheme_loc is None:
442
if exclude_trailing_slash and url.endswith('/'):
444
return _posix_split(url)
446
# Scheme with no path
449
# We have a fully defined path
450
url_base = url[:first_path_slash] # http://host, file://
451
path = url[first_path_slash:] # /file/foo
453
if sys.platform == 'win32' and url.startswith('file:///'):
454
# Strip off the drive letter
455
# url_base is currently file://
456
# path is currently /C:/foo
457
url_base, path = _win32_extract_drive_letter(url_base, path)
458
# now it should be file:///C: and /foo
460
if exclude_trailing_slash and len(path) > 1 and path.endswith('/'):
462
head, tail = _posix_split(path)
463
return url_base + head, tail
466
def _win32_strip_local_trailing_slash(url):
467
"""Strip slashes after the drive letter"""
468
if len(url) > WIN32_MIN_ABS_FILEURL_LENGTH:
474
def strip_trailing_slash(url):
475
"""Strip trailing slash, except for root paths.
477
The definition of 'root path' is platform-dependent.
478
This assumes that all URLs are valid netloc urls, such that they
481
It searches for ://, and then refuses to remove the next '/'.
482
It can also handle relative paths
484
path/to/foo => path/to/foo
485
path/to/foo/ => path/to/foo
486
http://host/path/ => http://host/path
487
http://host/path => http://host/path
488
http://host/ => http://host/
490
file:///foo/ => file:///foo
491
# This is unique on win32 platforms, and is the only URL
492
# format which does it differently.
493
file:///c|/ => file:///c:/
495
if not url.endswith('/'):
498
if sys.platform == 'win32' and url.startswith('file://'):
499
return _win32_strip_local_trailing_slash(url)
501
scheme_loc, first_path_slash = _find_scheme_and_separator(url)
502
if scheme_loc is None:
503
# This is a relative path, as it has no scheme
504
# so just chop off the last character
507
if first_path_slash is None or first_path_slash == len(url)-1:
508
# Don't chop off anything if the only slash is the path
516
"""Unescape relpath from url format.
518
This returns a Unicode path from a URL
520
# jam 20060427 URLs are supposed to be ASCII only strings
521
# If they are passed in as unicode, urllib.unquote
522
# will return a UNICODE string, which actually contains
523
# utf-8 bytes. So we have to ensure that they are
524
# plain ASCII strings, or the final .decode will
525
# try to encode the UNICODE => ASCII, and then decode
529
except UnicodeError, e:
530
raise errors.InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
532
unquoted = urllib.unquote(url)
534
unicode_path = unquoted.decode('utf-8')
535
except UnicodeError, e:
536
raise errors.InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
540
# Characters that, when found percent-escaped, must be left escaped
_no_decode_chars = ';/?:@&=+$,#'
_no_decode_ords = list(map(ord, _no_decode_chars))
# Two-digit hex spelling of each of those characters: every lower-case form
# first, followed by every upper-case form.
_no_decode_hex = [_fmt % _ord
                  for _fmt in ('%02x', '%02X')
                  for _ord in _no_decode_ords]
# Maps a two-digit hex string (in either case) to the character it encodes.
_hex_display_map = {}
for _fmt in ('%02x', '%02X'):
    for _ord in range(256):
        _hex_display_map[_fmt % _ord] = chr(_ord)
# The characters listed above map back to their own percent-escape instead.
for _hex in _no_decode_hex:
    _hex_display_map[_hex] = '%' + _hex
# Do not leave the loop helpers behind as module attributes.
del _fmt, _ord, _hex
550
# Characters that never need percent-encoding; if one of them is found
# encoded it is always safe to decode it.
_url_dont_escape_characters = set(''.join([
    "abcdefghijklmnopqrstuvwxyz",  # Lowercase alpha
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ",  # Uppercase alpha
    "0123456789",                  # Numbers
    "-._~",                        # Unreserved characters
]))
559
# Characters that are accepted in a URL as they are, without escaping
_url_safe_characters = set(''.join([
    "abcdefghijklmnopqrstuvwxyz",  # Lowercase alpha
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ",  # Uppercase alpha
    "0123456789",                  # Numbers
    "_.-!~*'()",                   # Unreserved characters
    "/;?:@&=+$,",                  # Reserved characters
    "%#",                          # Extra reserved characters
]))
569
def unescape_for_display(url, encoding):
570
"""Decode what you can for a URL, so that we get a nice looking path.
572
This will turn file:// urls into local paths, and try to decode
573
any portions of a http:// style url that it can.
575
Any sections of the URL which can't be represented in the encoding or
576
need to stay as escapes are left alone.
578
:param url: A 7-bit ASCII URL
579
:param encoding: The final output encoding
581
:return: A unicode string which can be safely encoded into the
585
raise ValueError('you cannot specify None for the display encoding')
586
if url.startswith('file://'):
588
path = local_path_from_url(url)
589
path.encode(encoding)
594
# Split into sections to try to decode utf-8
596
for i in xrange(1, len(res)):
597
escaped_chunks = res[i].split('%')
598
for j in xrange(1, len(escaped_chunks)):
599
item = escaped_chunks[j]
601
escaped_chunks[j] = _hex_display_map[item[:2]] + item[2:]
603
# Put back the percent symbol
604
escaped_chunks[j] = '%' + item
605
except UnicodeDecodeError:
606
escaped_chunks[j] = unichr(int(item[:2], 16)) + item[2:]
607
unescaped = ''.join(escaped_chunks)
609
decoded = unescaped.decode('utf-8')
610
except UnicodeDecodeError:
611
# If this path segment cannot be properly utf-8 decoded
612
# after doing unescaping we will just leave it alone
616
decoded.encode(encoding)
617
except UnicodeEncodeError:
618
# If this chunk cannot be encoded in the local
619
# encoding, then we should leave it alone
622
# Otherwise take the url decoded one
624
return u'/'.join(res)
627
def derive_to_location(from_location):
628
"""Derive a TO_LOCATION given a FROM_LOCATION.
630
The normal case is a FROM_LOCATION of http://foo/bar => bar.
631
The Right Thing for some logical destinations may differ though
632
because no / may be present at all. In that case, the result is
633
the full name without the scheme indicator, e.g. lp:foo-bar => foo-bar.
634
This latter case also applies when a Windows drive
635
is used without a path, e.g. c:foo-bar => foo-bar.
636
If no /, path separator or : is found, the from_location is returned.
638
if from_location.find("/") >= 0 or from_location.find(os.sep) >= 0:
639
return os.path.basename(from_location.rstrip("/\\"))
641
sep = from_location.find(":")
643
return from_location[sep+1:]
648
def _is_absolute(url):
    """Return True if url is treated as an absolute path.

    url counts as absolute when osutils.pathjoin('/foo', url) gives back
    url itself.
    """
    joined = osutils.pathjoin('/foo', url)
    return joined == url
652
def rebase_url(url, old_base, new_base):
653
"""Convert a relative path from an old base URL to a new base URL.
655
The result will be a relative path.
656
Absolute paths and full URLs are returned unaltered.
658
scheme, separator = _find_scheme_and_separator(url)
659
if scheme is not None:
661
if _is_absolute(url):
663
old_parsed = urlparse.urlparse(old_base)
664
new_parsed = urlparse.urlparse(new_base)
665
if (old_parsed[:2]) != (new_parsed[:2]):
666
raise errors.InvalidRebaseURLs(old_base, new_base)
667
return determine_relative_path(new_parsed[2],
668
join(old_parsed[2], url))
671
def determine_relative_path(from_path, to_path):
672
"""Determine a relative path from from_path to to_path."""
673
from_segments = osutils.splitpath(from_path)
674
to_segments = osutils.splitpath(to_path)
676
for count, (from_element, to_element) in enumerate(zip(from_segments,
678
if from_element != to_element:
682
unique_from = from_segments[count:]
683
unique_to = to_segments[count:]
684
segments = (['..'] * len(unique_from) + unique_to)
685
if len(segments) == 0:
687
return osutils.pathjoin(*segments)
692
"""Extract the server address, the credentials and the path from the url.
694
user, password, host and path should be quoted if they contain reserved
697
:param url: a quoted url
699
:return: (scheme, user, password, host, port, path) tuple, all fields
702
if isinstance(url, unicode):
703
raise errors.InvalidURL('should be ascii:\n%r' % url)
704
url = url.encode('utf-8')
705
(scheme, netloc, path, params,
706
query, fragment) = urlparse.urlparse(url, allow_fragments=False)
707
user = password = host = port = None
709
user, host = netloc.rsplit('@', 1)
711
user, password = user.split(':', 1)
712
password = urllib.unquote(password)
713
user = urllib.unquote(user)
717
if ':' in host and not (host[0] == '[' and host[-1] == ']'): #there *is* port
718
host, port = host.rsplit(':',1)
722
raise errors.InvalidURL('invalid port number %s in url:\n%s' %
724
if host != "" and host[0] == '[' and host[-1] == ']': #IPv6
727
host = urllib.unquote(host)
728
path = urllib.unquote(path)
730
return (scheme, user, password, host, port, path)