~bzr-pqm/bzr/bzr.dev : contents of bzrlib/urlutils.py at revision 4661

~bzr-pqm/bzr/bzr.dev : (revision 4661)

3873.3.1 by Martin Pool Move Transport._split_url to urlutils, and ad a simple test	1	# Copyright (C) 2006, 2008 Canonical Ltd
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	2	#
	3	# This program is free software; you can redistribute it and/or modify
	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob update FSF mailing address	15	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	16
	17	"""A collection of functions for handling URL operations."""
	18
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	19	import os
1685.1.50 by John Arbash Meinel Added an re for handling scheme paths.	20	import re
	21	import sys
1996.3.12 by John Arbash Meinel Change how 'revision' is imported to avoid problems later	22
	23	from bzrlib.lazy_import import lazy_import
	24	lazy_import(globals(), """
	25	from posixpath import split as _posix_split, normpath as _posix_normpath
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	26	import urllib
3242.3.26 by Aaron Bentley Implement rebase_url	27	import urlparse
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	28
1996.3.12 by John Arbash Meinel Change how 'revision' is imported to avoid problems later	29	from bzrlib import (
	30	errors,
	31	osutils,
	32	)
	33	""")
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	34
	35
def basename(url, exclude_trailing_slash=True):
    """Return the final path component of a URL.

    :param url: The URL to take the last component of.
    :param exclude_trailing_slash: When true, a URL that looks like
        "path/to/foo/" yields 'foo' rather than ''.
    :return: The second element of split(url). This can be '' when the
        trailing slash is not excluded, or when the URL is at its root.
    """
    _parent, last_component = split(
        url, exclude_trailing_slash=exclude_trailing_slash)
    return last_component
	47
	48
	49	def dirname(url, exclude_trailing_slash=True):
	50	"""Return the parent directory of the given path.
	51
	52	:param url: Relative or absolute URL
	53	:param exclude_trailing_slash: Remove a final slash
	54	(treat http://host/foo/ as http://host/foo, but
	55	http://host/ stays http://host/)
	56	:return: Everything in the URL except the last path chunk
	57	"""
	58	# TODO: jam 20060502 This was named dirname to be consistent
	59	# with the os functions, but maybe "parent" would be better
	60	return split(url, exclude_trailing_slash=exclude_trailing_slash)[0]
	61
	62
def escape(relpath):
    """Percent-quote relpath so that it is valid inside a URL.

    Unicode input is encoded as UTF-8 before it is quoted. The characters
    '/' and '~' are passed through unquoted.

    :param relpath: The path to quote.
    :return: The quoted path as a plain str.
    """
    utf8_path = relpath
    if isinstance(utf8_path, unicode):
        utf8_path = utf8_path.encode('utf-8')
    quoted = urllib.quote(utf8_path, safe='/~')
    # Once encoded and quoted the result should be pure ASCII; going
    # through str() enforces that.
    return str(quoted)
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	70
	71
def file_relpath(base, path):
    """Return the escaped part of path that is relative to base.

    Both arguments are assumed to be fully specified file:// URLs.

    :param base: The file:// URL that path is made relative to.
    :param path: The file:// URL to express relative to base.
    :return: The escape()d result of osutils.relpath on the two local paths.
    :raises ValueError: if base is shorter than MIN_ABS_FILEURL_LENGTH.
    """
    if len(base) < MIN_ABS_FILEURL_LENGTH:
        message = ('Length of base (%r) must equal or'
                   ' exceed the platform minimum url length (which is %d)' %
                   (base, MIN_ABS_FILEURL_LENGTH))
        raise ValueError(message)
    local_base = local_path_from_url(base)
    local_path = local_path_from_url(path)
    return escape(osutils.relpath(local_base, local_path))
1685.1.46 by John Arbash Meinel Sorting functions by name.	84
1685.1.46 by John Arbash Meinel Sorting functions by name.	85
def _find_scheme_and_separator(url):
    """Locate the scheme separator (://) and the first path separator.

    This is a helper function for the other path utilities. It could
    probably be replaced by urlparse.

    :param url: The URL (or relative path) to inspect.
    :return: (None, None) if url does not match scheme://...; otherwise a
        tuple of the length of the scheme (which is the offset of '://')
        and the offset in url of the first '/' after '://', or None when
        there is no such slash.
    """
    found = _url_scheme_re.match(url)
    if found is None:
        return None, None

    scheme_length = len(found.group('scheme'))
    # Position of the path separating slash, counted from just after '://'
    slash_in_rest = found.group('path').find('/')
    if slash_in_rest < 0:
        return scheme_length, None
    # Convert to an offset in url: skip the scheme and the 3 chars of '://'
    return scheme_length, scheme_length + 3 + slash_in_rest
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	105
	106
def join(base, *args):
    """Create a URL by joining sections.

    This will normalize '..', assuming that paths are absolute
    (it assumes no symlinks in either path)

    If any of *args is an absolute URL, it will be treated correctly.
    Example:
        join('http://foo', 'http://bar') => 'http://bar'
        join('http://foo', 'bar') => 'http://foo/bar'
        join('http://foo', 'bar', '../baz') => 'http://foo/baz'

    :param base: The URL or path fragment to start from.
    :param args: Sections to join onto base, in order. A section that
        matches scheme://... replaces the scheme, host and path collected
        so far; any other section is combined with the current path by
        joinpath().
    :return: The joined URL. base is returned unchanged when no sections
        are given.
    :raises errors.InvalidURLJoin: raised by joinpath() when a '..' would go
        above the root.
    """
    if not args:
        return base
    match = _url_scheme_re.match(base)
    scheme = None
    if match:
        scheme = match.group('scheme')
        # 'path' is a list of segments; for a URL with a scheme the first
        # segment is the host.
        path = match.group('path').split('/')
        if path[-1:] == ['']:
            # Strip off a trailing slash
            # This helps both when we are at the root, and when
            # 'base' has an extra slash at the end
            path = path[:-1]
    else:
        path = base.split('/')

    if scheme is not None and len(path) >= 1:
        host = path[:1]
        # the path should be represented as an abs path.
        # we know this must be absolute because of the presence of a URL scheme.
        # remove_root records that the leading '' added here has to be
        # taken off again once the sections have been joined.
        remove_root = True
        path = [''] + path[1:]
    else:
        # create an empty host, but dont alter the path - this might be a
        # relative url fragment.
        host = []
        remove_root = False

    for arg in args:
        match = _url_scheme_re.match(arg)
        if match:
            # Absolute URL
            scheme = match.group('scheme')
            # this skips .. normalisation, making http://host/../../..
            # be rather strange.
            path = match.group('path').split('/')
            # set the host and path according to new absolute URL, discarding
            # any previous values.
            # XXX: duplicates mess from earlier in this function.  This URL
            # manipulation code needs some cleaning up.
            if scheme is not None and len(path) >= 1:
                host = path[:1]
                path = path[1:]
                # url scheme implies absolute path.
                path = [''] + path
            else:
                # no url scheme we take the path as is.
                host = []
        else:
            # Relative section: joinpath() works on strings, so convert the
            # segment list to a string and back again.
            path = '/'.join(path)
            path = joinpath(path, arg)
            path = path.split('/')
    if remove_root and path[0:1] == ['']:
        del path[0]
    if host:
        # Remove the leading slash from the path, so long as it isn't also the
        # trailing slash, which we want to keep if present.
        if path and path[0] == '' and len(path) > 1:
            del path[0]
        path = host + path

    if scheme is None:
        return '/'.join(path)
    return scheme + '://' + '/'.join(path)
	182
	183
def joinpath(base, *args):
    """Join URL path segments to a URL path segment.

    This is somewhat like osutils.joinpath, but intended for URLs.

    A section starting with '/' discards everything collected so far,
    '.' chunks are skipped and '..' chunks remove the previous chunk.

    XXX: this duplicates some normalisation logic, and also duplicates a lot of
    path handling logic that already exists in some Transport implementations.
    We really should try to have exactly one place in the code base responsible
    for combining paths of URLs.

    :param base: The path to start from.
    :param args: The sections to apply to base, in order.
    :return: The combined path; '/' when only the root is left.
    :raises errors.InvalidURLJoin: if a '..' is applied while the collected
        path is exactly the root.
    """
    segments = base.split('/')
    if segments[-1] == '' and len(segments) > 1:
        # base ended in a trailing /, which is not a segment of its own.
        del segments[-1]
    for section in args:
        if section.startswith('/'):
            # An absolute section starts over from nothing.
            segments = []
        for piece in section.split('/'):
            if piece == '..':
                if segments == ['']:
                    raise errors.InvalidURLJoin('Cannot go above root',
                            base, args)
                segments.pop()
            elif piece != '.':
                segments.append(piece)
    if segments == ['']:
        return '/'
    return '/'.join(segments)
	215
	216
1685.1.46 by John Arbash Meinel Sorting functions by name.	217	# jam 20060502 Sorted to 'l' because the final target is 'local_path_from_url'
	218	def _posix_local_path_from_url(url):
	219	"""Convert a url like file:///path/to/foo into /path/to/foo"""
	220	if not url.startswith('file:///'):
	221	raise errors.InvalidURL(url, 'local urls must start with file:///')
	222	# We only strip off 2 slashes
	223	return unescape(url[len('file://'):])
	224
	225
	226	def _posix_local_path_to_url(path):
	227	"""Convert a local path like ./foo into a URL like file:///path/to/foo
	228
	229	This also handles transforming escaping unicode characters, etc.
	230	"""
3943.8.1 by Marius Kruger remove all trailing whitespace from bzr source	231	# importing directly from posixpath allows us to test this
1685.1.46 by John Arbash Meinel Sorting functions by name.	232	# on non-posix platforms
1711.4.5 by John Arbash Meinel the _posix_* routines should use posixpath not os.path, so tests pass on win32	233	return 'file://' + escape(_posix_normpath(
1996.3.12 by John Arbash Meinel Change how 'revision' is imported to avoid problems later	234	osutils._posix_abspath(path)))
1685.1.46 by John Arbash Meinel Sorting functions by name.	235
	236
	237	def _win32_local_path_from_url(url):
1711.4.4 by John Arbash Meinel Fix some broken tests because of stupid ntpath.abspath behavior	238	"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
2162.2.7 by Alexander Belchenko Win32 UNC path \\HOST\path mapped to URL file://HOST/path	239	if not url.startswith('file://'):
	240	raise errors.InvalidURL(url, 'local urls must start with file:///, '
	241	'UNC path urls must start with file://')
1685.1.46 by John Arbash Meinel Sorting functions by name.	242	# We strip off all 3 slashes
2162.2.7 by Alexander Belchenko Win32 UNC path \\HOST\path mapped to URL file://HOST/path	243	win32_url = url[len('file:'):]
2162.2.2 by Alexander Belchenko Support for win32 UNC path (like: \\HOST\path)	244	# check for UNC path: //HOST/path
2162.2.7 by Alexander Belchenko Win32 UNC path \\HOST\path mapped to URL file://HOST/path	245	if not win32_url.startswith('///'):
2162.2.2 by Alexander Belchenko Support for win32 UNC path (like: \\HOST\path)	246	if (win32_url[2] == '/'
	247	or win32_url[3] in '\|:'):
	248	raise errors.InvalidURL(url, 'Win32 UNC path urls'
2162.2.7 by Alexander Belchenko Win32 UNC path \\HOST\path mapped to URL file://HOST/path	249	' have form file://HOST/path')
2162.2.2 by Alexander Belchenko Support for win32 UNC path (like: \\HOST\path)	250	return unescape(win32_url)
3503.1.2 by adwi2 Permits Windows to serve all paths on all drives.	251
	252	# allow empty paths so we can serve all roots
	253	if win32_url == '///':
	254	return '/'
3943.8.1 by Marius Kruger remove all trailing whitespace from bzr source	255
2162.2.2 by Alexander Belchenko Support for win32 UNC path (like: \\HOST\path)	256	# usual local path with drive letter
2162.2.7 by Alexander Belchenko Win32 UNC path \\HOST\path mapped to URL file://HOST/path	257	if (win32_url[3] not in ('abcdefghijklmnopqrstuvwxyz'
1711.4.4 by John Arbash Meinel Fix some broken tests because of stupid ntpath.abspath behavior	258	'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
2162.2.7 by Alexander Belchenko Win32 UNC path \\HOST\path mapped to URL file://HOST/path	259	or win32_url[4] not in '\|:'
	260	or win32_url[5] != '/'):
1711.4.4 by John Arbash Meinel Fix some broken tests because of stupid ntpath.abspath behavior	261	raise errors.InvalidURL(url, 'Win32 file urls start with'
1711.4.8 by John Arbash Meinel switch to prefering lowercase drive letters, since that matches os.getcwd() drive letters	262	' file:///x:/, where x is a valid drive letter')
2162.2.7 by Alexander Belchenko Win32 UNC path \\HOST\path mapped to URL file://HOST/path	263	return win32_url[3].upper() + u':' + unescape(win32_url[5:])
1685.1.46 by John Arbash Meinel Sorting functions by name.	264
	265
	266	def _win32_local_path_to_url(path):
1711.4.4 by John Arbash Meinel Fix some broken tests because of stupid ntpath.abspath behavior	267	"""Convert a local path like ./foo into a URL like file:///C:/path/to/foo
1685.1.46 by John Arbash Meinel Sorting functions by name.	268
	269	This also handles transforming escaping unicode characters, etc.
	270	"""
3943.8.1 by Marius Kruger remove all trailing whitespace from bzr source	271	# importing directly from ntpath allows us to test this
1711.4.4 by John Arbash Meinel Fix some broken tests because of stupid ntpath.abspath behavior	272	# on non-win32 platform
	273	# FIXME: It turns out that on nt, ntpath.abspath uses nt._getfullpathname
	274	# which actually strips trailing space characters.
	275	# The worst part is that under linux ntpath.abspath has different
	276	# semantics, since 'nt' is not an available module.
3503.1.1 by Adrian Wilkins Add a couple of special cases to urlutils._win32_path_(from\|to)_url	277	if path == '/':
3503.1.2 by adwi2 Permits Windows to serve all paths on all drives.	278	return 'file:///'
3503.1.1 by Adrian Wilkins Add a couple of special cases to urlutils._win32_path_(from\|to)_url	279
2279.4.2 by Alexander Belchenko Don't do normpath after abspath, because this function is called inside abspath	280	win32_path = osutils._win32_abspath(path)
2162.2.2 by Alexander Belchenko Support for win32 UNC path (like: \\HOST\path)	281	# check for UNC path \\HOST\path
	282	if win32_path.startswith('//'):
2162.2.7 by Alexander Belchenko Win32 UNC path \\HOST\path mapped to URL file://HOST/path	283	return 'file:' + escape(win32_path)
3234.3.1 by Alexander Belchenko ensure that local_path_to_url() always returns plain string, not unicode.	284	return ('file:///' + str(win32_path[0].upper()) + ':' +
	285	escape(win32_path[2:]))
1685.1.46 by John Arbash Meinel Sorting functions by name.	286
	287
# Platform selection: start with the posix implementations and values, and
# replace them with the win32 ones when running on win32.
local_path_to_url = _posix_local_path_to_url
local_path_from_url = _posix_local_path_from_url
# Minimum length of an absolute file URL; file_relpath() rejects a base
# shorter than MIN_ABS_FILEURL_LENGTH.
MIN_ABS_FILEURL_LENGTH = len('file:///')
# Defined on every platform so that the win32 helpers can be used (and
# tested) anywhere.
WIN32_MIN_ABS_FILEURL_LENGTH = len('file:///C:/')

if sys.platform == 'win32':
    local_path_to_url = _win32_local_path_to_url
    local_path_from_url = _win32_local_path_from_url

    MIN_ABS_FILEURL_LENGTH = WIN32_MIN_ABS_FILEURL_LENGTH


# Matches 'scheme://path'. The 'scheme' group is two or more characters
# other than ':' and '/', the 'path' group is everything after '://'.
# NOTE(review): the two character minimum presumably keeps a single drive
# letter from being taken for a scheme - confirm.
_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,})://(?P<path>.*)$')
# Matches a single percent-escape: '%' followed by two hex digits.
_url_hex_escapes_re = re.compile(r'(%[0-9a-fA-F]{2})')
	302
	303
	304	def _unescape_safe_chars(matchobj):
	305	"""re.sub callback to convert hex-escapes to plain characters (if safe).
3943.8.1 by Marius Kruger remove all trailing whitespace from bzr source	306
2208.4.1 by Andrew Bennetts normalize_url should normalise escaping of unreserved characters, like '~'.	307	e.g. '%7E' will be converted to '~'.
	308	"""
	309	hex_digits = matchobj.group(0)[1:]
	310	char = chr(int(hex_digits, 16))
	311	if char in _url_dont_escape_characters:
	312	return char
	313	else:
	314	return matchobj.group(0).upper()
1685.1.50 by John Arbash Meinel Added an re for handling scheme paths.	315
	316
	317	def normalize_url(url):
	318	"""Make sure that a path string is in fully normalized URL form.
3943.8.1 by Marius Kruger remove all trailing whitespace from bzr source	319
2208.4.1 by Andrew Bennetts normalize_url should normalise escaping of unreserved characters, like '~'.	320	This handles URLs which have unicode characters, spaces,
1685.1.50 by John Arbash Meinel Added an re for handling scheme paths.	321	special characters, etc.
	322
	323	It has two basic modes of operation, depending on whether the
	324	supplied string starts with a url specifier (scheme://) or not.
	325	If it does not have a specifier it is considered a local path,
	326	and will be converted into a file:/// url. Non-ascii characters
	327	will be encoded using utf-8.
	328	If it does have a url specifier, it will be treated as a "hybrid"
	329	URL. Basically, a URL that should have URL special characters already
	330	escaped (like +?&# etc), but may have unicode characters, etc
	331	which would not be valid in a real URL.
	332
	333	:param url: Either a hybrid URL or a local path
	334	:return: A normalized URL which only includes 7-bit ASCII characters.
	335	"""
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	336	m = _url_scheme_re.match(url)
	337	if not m:
	338	return local_path_to_url(url)
2208.4.1 by Andrew Bennetts normalize_url should normalise escaping of unreserved characters, like '~'.	339	scheme = m.group('scheme')
	340	path = m.group('path')
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	341	if not isinstance(url, unicode):
	342	for c in url:
	343	if c not in _url_safe_characters:
1685.1.53 by John Arbash Meinel Updated normalize_url	344	raise errors.InvalidURL(url, 'URLs can only contain specific'
1685.1.53 by John Arbash Meinel Updated normalize_url	345	' safe characters (not %r)' % c)
2208.4.1 by Andrew Bennetts normalize_url should normalise escaping of unreserved characters, like '~'.	346	path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
	347	return str(scheme + '://' + ''.join(path))
	348
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	349	# We have a unicode (hybrid) url
2208.4.1 by Andrew Bennetts normalize_url should normalise escaping of unreserved characters, like '~'.	350	path_chars = list(path)
1685.1.50 by John Arbash Meinel Added an re for handling scheme paths.	351
2208.4.1 by Andrew Bennetts normalize_url should normalise escaping of unreserved characters, like '~'.	352	for i in xrange(len(path_chars)):
	353	if path_chars[i] not in _url_safe_characters:
	354	chars = path_chars[i].encode('utf-8')
	355	path_chars[i] = ''.join(
	356	['%%%02X' % ord(c) for c in path_chars[i].encode('utf-8')])
	357	path = ''.join(path_chars)
	358	path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
	359	return str(scheme + '://' + path)
1685.1.50 by John Arbash Meinel Added an re for handling scheme paths.	360
	361
def relative_url(base, other):
    """Return a path to other from base.

    If other is unrelated to base, return other. Else return a relative path.
    This assumes no symlinks as part of the url.

    :param base: The URL the result is relative to.
    :param other: The URL to express relative to base.
    :return: other itself when either URL has no scheme or no path slash,
        or when the parts before the first path slash differ (on win32,
        also when two file:// URLs are on different drives). Otherwise a
        relative path made of '..' and path sections, or '.' if the two
        paths are the same.
    """
    _scheme_end, base_slash = _find_scheme_and_separator(base)
    if base_slash is None:
        return other

    _scheme_end, other_slash = _find_scheme_and_separator(other)
    if other_slash is None:
        return other

    # Everything up to the first path slash is the scheme and the host, so
    # this takes care of differing schemes or hosts
    base_prefix = base[:base_slash]
    if base_prefix != other[:other_slash]:
        return other
    if sys.platform == 'win32' and base_prefix == 'file://':
        # Compare the two characters after the slash (the drive)
        base_drive = base[base_slash+1:base_slash+3]
        other_drive = other[other_slash+1:other_slash+3]
        if base_drive != other_drive:
            return other

    base_path = base[base_slash+1:]
    if base_path.endswith('/'):
        base_path = base_path[:-1]
    other_path = other[other_slash+1:]

    base_parts = base_path.split('/')
    if base_parts == ['']:
        base_parts = []
    other_parts = other_path.split('/')
    if other_parts == ['']:
        other_parts = []

    # Count the leading sections that both paths have in common.
    common = 0
    for base_part, other_part in zip(base_parts, other_parts):
        if base_part != other_part:
            break
        common += 1

    # Go up once for each remaining section of base, then down into other.
    relative_parts = ['..'] * len(base_parts[common:])
    relative_parts.extend(other_parts[common:])

    return "/".join(relative_parts) or "."
	412
	413
1711.2.43 by John Arbash Meinel Split out win32 specific code so that it can be tested on all platforms.	414	def _win32_extract_drive_letter(url_base, path):
	415	"""On win32 the drive letter needs to be added to the url base."""
	416	# Strip off the drive letter
	417	# path is currently /C:/foo
	418	if len(path) < 3 or path[2] not in ':\|' or path[3] != '/':
3943.8.1 by Marius Kruger remove all trailing whitespace from bzr source	419	raise errors.InvalidURL(url_base + path,
1711.2.43 by John Arbash Meinel Split out win32 specific code so that it can be tested on all platforms.	420	'win32 file:/// paths need a drive letter')
	421	url_base += path[0:3] # file:// + /C:
	422	path = path[3:] # /foo
	423	return url_base, path
	424
	425
def split(url, exclude_trailing_slash=True):
    """Split a URL into its parent directory and a child directory.

    :param url: A relative or absolute URL
    :param exclude_trailing_slash: Strip off a final '/' if it is part
        of the path (but not if it is part of the protocol specification)

    :return: (parent_url, child_dir).  child_dir may be the empty string if
        we're at the root.
    """
    scheme_loc, first_path_slash = _find_scheme_and_separator(url)

    if first_path_slash is None:
        # Either a scheme without any path, or a relative path
        if scheme_loc is not None:
            # Scheme with no path
            return url, ''
        # Relative path
        if exclude_trailing_slash and url.endswith('/'):
            url = url[:-1]
        return _posix_split(url)

    # We have a fully defined path: cut it at the first path slash
    prefix = url[:first_path_slash]     # http://host, file://
    path = url[first_path_slash:]       # /file/foo

    if sys.platform == 'win32' and url.startswith('file:///'):
        # Move the drive letter from the path to the prefix:
        # file:// and /C:/foo become file:///C: and /foo
        prefix, path = _win32_extract_drive_letter(prefix, path)

    if exclude_trailing_slash and len(path) > 1 and path.endswith('/'):
        path = path[:-1]
    parent, child = _posix_split(path)
    return prefix + parent, child
	464
1685.1.46 by John Arbash Meinel Sorting functions by name.	465
def _win32_strip_local_trailing_slash(url):
    """Strip the final character of url, unless url is only a drive root.

    A url that is no longer than WIN32_MIN_ABS_FILEURL_LENGTH is returned
    unchanged, so the slash after the drive letter is kept.
    """
    if len(url) <= WIN32_MIN_ABS_FILEURL_LENGTH:
        return url
    return url[:-1]
	472
	473
def strip_trailing_slash(url):
    """Strip trailing slash, except for root paths.

    The definition of 'root path' is platform-dependent.
    This assumes that all URLs are valid netloc urls, such that they
    form:
    scheme://host/path
    It searches for ://, and then refuses to remove the next '/'.
    It can also handle relative paths
    Examples:
        path/to/foo       => path/to/foo
        path/to/foo/      => path/to/foo
        http://host/path/ => http://host/path
        http://host/path  => http://host/path
        http://host/      => http://host/
        file:///          => file:///
        file:///foo/      => file:///foo
        # On win32 file:// URLs are handled differently: the slash after
        # the drive letter is kept.
        file:///C:/       => file:///C:/

    :param url: The URL or relative path to strip.
    :return: url without its trailing slash, or url unchanged if it has
        none or if the slash must stay.
    """
    if not url.endswith('/'):
        # Nothing to do
        return url
    if sys.platform == 'win32' and url.startswith('file://'):
        return _win32_strip_local_trailing_slash(url)

    scheme_loc, first_path_slash = _find_scheme_and_separator(url)
    has_scheme = scheme_loc is not None
    # With a scheme, the slash has to stay when it is the path separating
    # slash itself (or when there is no such slash). A relative path has
    # no scheme and always loses its last character.
    if has_scheme and (first_path_slash is None
                       or first_path_slash == len(url)-1):
        return url
    return url[:-1]
	513
1685.1.47 by John Arbash Meinel s comes before u	514
def unescape(url):
    """Turn a %-escaped url into the unicode path it stands for.

    :param url: the escaped url; it must be convertible to a plain ASCII str
    :return: a unicode string, made by unquoting url and decoding the result
        as utf-8
    :raises errors.InvalidURL: if url cannot be converted to a plain str, or
        if the unquoted text cannot be decoded as utf-8
    """
    # URLs are meant to be ASCII-only strings.  When urllib.unquote is given
    # a unicode object it hands back a unicode object that really contains
    # utf-8 bytes, and the final .decode would then go UNICODE => ASCII
    # before decoding as utf-8.  Forcing a plain str first avoids that.
    try:
        ascii_url = str(url)
    except UnicodeError:
        # sys.exc_info() fetches the active exception without needing a
        # bound except clause.
        problem = sys.exc_info()[1]
        raise errors.InvalidURL(
            url, 'URL was not a plain ASCII url: %s' % (problem,))

    raw_path = urllib.unquote(ascii_url)
    try:
        return raw_path.decode('utf-8')
    except UnicodeError:
        problem = sys.exc_info()[1]
        raise errors.InvalidURL(
            ascii_url, 'Unable to encode the URL as utf-8: %s' % (problem,))
	538
	539
	540	# These are characters that if escaped, should stay that way
	541	_no_decode_chars = ';/?:@&=+$,#'
	542	_no_decode_ords = [ord(c) for c in _no_decode_chars]
3943.8.1 by Marius Kruger remove all trailing whitespace from bzr source	543	_no_decode_hex = (['%02x' % o for o in _no_decode_ords]
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	544	+ ['%02X' % o for o in _no_decode_ords])
1685.1.50 by John Arbash Meinel Added an re for handling scheme paths.	545	_hex_display_map = dict(([('%02x' % o, chr(o)) for o in range(256)]
	546	+ [('%02X' % o, chr(o)) for o in range(256)]))
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	547	#These entries get mapped to themselves
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	548	_hex_display_map.update((hex,'%'+hex) for hex in _no_decode_hex)
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	549
2208.4.1 by Andrew Bennetts normalize_url should normalise escaping of unreserved characters, like '~'.	550	# These characters shouldn't be percent-encoded, and it's always safe to
	551	# unencode them if they are.
	552	_url_dont_escape_characters = set(
	553	"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
	554	"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
	555	"0123456789" # Numbers
	556	"-._~" # Unreserved characters
	557	)
	558
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	559	# These characters should not be escaped
2167.2.2 by Aaron Bentley Update safe character list	560	_url_safe_characters = set(
	561	"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
	562	"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
	563	"0123456789" # Numbers
	564	"_.-!~*'()" # Unreserved characters
	565	"/;?:@&=+$," # Reserved characters
	566	"%#" # Extra reserved characters
	567	)
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	568
def unescape_for_display(url, encoding):
    """Make url as readable as possible for showing to a user.

    A file:// url is converted to a local path with local_path_from_url.
    For any other url, every '/'-separated segment after the first has its
    %-escapes expanded through _hex_display_map.  A segment is replaced by
    its expanded form only when that form is valid utf-8 and can be encoded
    in the requested encoding; otherwise the segment is left exactly as it
    was.

    :param url: A 7-bit ASCII URL
    :param encoding: The final output encoding; must not be None
    :return: The text to display.  For non-file urls this is a unicode
        string; for a file:// url it is the local path, or url itself
        when the conversion raises UnicodeError.
    :raises ValueError: if encoding is None
    """
    if encoding is None:
        raise ValueError('you cannot specify None for the display encoding')

    if url.startswith('file://'):
        try:
            path = local_path_from_url(url)
            # Only checks that the path is encodable; the result is dropped
            path.encode(encoding)
        except UnicodeError:
            return url
        return path

    # Work on one path segment at a time so that a segment which cannot be
    # decoded does not stop the others from being decoded.
    segments = url.split('/')
    for seg_index in xrange(1, len(segments)):
        pieces = segments[seg_index].split('%')
        # pieces[0] is the text before the first '%'; every later piece
        # starts with what should be a two-digit hex pair.
        for piece_index in xrange(1, len(pieces)):
            piece = pieces[piece_index]
            hex_pair = piece[:2]
            remainder = piece[2:]
            try:
                expanded = _hex_display_map[hex_pair] + remainder
            except KeyError:
                # Not a known hex pair: put back the percent symbol
                expanded = '%' + piece
            except UnicodeDecodeError:
                # NOTE(review): presumably reached when remainder is unicode
                # and the mapped byte is non-ASCII -- confirm.
                expanded = unichr(int(hex_pair, 16)) + remainder
            pieces[piece_index] = expanded

        unescaped = ''.join(pieces)
        try:
            decoded = unescaped.decode('utf-8')
        except UnicodeDecodeError:
            # This segment is not valid utf-8 once unescaped: leave it alone
            continue
        try:
            decoded.encode(encoding)
        except UnicodeEncodeError:
            # This segment cannot be shown in the local encoding: leave it
            # alone
            continue
        # Otherwise take the decoded form
        segments[seg_index] = decoded
    return u'/'.join(segments)
2512.4.1 by Ian Clatworthy Fixes #115491 - 'branch lp:projname' now creates ./projname as exected	625
	626
	627	def derive_to_location(from_location):
	628	"""Derive a TO_LOCATION given a FROM_LOCATION.
	629
	630	The normal case is a FROM_LOCATION of http://foo/bar => bar.
	631	The Right Thing for some logical destinations may differ though
	632	because no / may be present at all. In that case, the result is
	633	the full name without the scheme indicator, e.g. lp:foo-bar => foo-bar.
	634	This latter case also applies when a Windows drive
	635	is used without a path, e.g. c:foo-bar => foo-bar.
	636	If no /, path separator or : is found, the from_location is returned.
	637	"""
	638	if from_location.find("/") >= 0 or from_location.find(os.sep) >= 0:
	639	return os.path.basename(from_location.rstrip("/\\"))
	640	else:
	641	sep = from_location.find(":")
	642	if sep > 0:
	643	return from_location[sep+1:]
	644	else:
	645	return from_location
3242.3.26 by Aaron Bentley Implement rebase_url	646
3242.3.35 by Aaron Bentley Cleanups and documentation	647
def _is_absolute(url):
    """Return True if joining url onto the base '/foo' gives back url itself.

    NOTE(review): this presumably relies on osutils.pathjoin discarding the
    base when its second argument is an absolute path -- confirm against
    osutils.
    """
    rejoined = osutils.pathjoin('/foo', url)
    return rejoined == url
	650
3242.3.35 by Aaron Bentley Cleanups and documentation	651
def rebase_url(url, old_base, new_base):
    """Re-express a path relative to old_base as a path relative to new_base.

    :param url: the path to convert.  If it has a scheme, or is an absolute
        path, it is returned unaltered.
    :param old_base: the URL that url is currently relative to
    :param new_base: the URL that the result should be relative to
    :return: a relative path (or url itself, see above)
    :raises errors.InvalidRebaseURLs: if old_base and new_base differ in
        scheme or network location
    """
    scheme, _separator = _find_scheme_and_separator(url)
    if scheme is not None or _is_absolute(url):
        return url

    old_parts = urlparse.urlparse(old_base)
    new_parts = urlparse.urlparse(new_base)
    # Items 0 and 1 of a parsed URL are the scheme and the netloc
    same_origin = (old_parts[:2] == new_parts[:2])
    if not same_origin:
        raise errors.InvalidRebaseURLs(old_base, new_base)

    # Item 2 is the path.  Resolve url against the old path first, then
    # describe that location as seen from the new path.
    target_path = join(old_parts[2], url)
    return determine_relative_path(new_parts[2], target_path)
3242.3.26 by Aaron Bentley Implement rebase_url	669
	670
	671	def determine_relative_path(from_path, to_path):
	672	"""Determine a relative path from from_path to to_path."""
	673	from_segments = osutils.splitpath(from_path)
	674	to_segments = osutils.splitpath(to_path)
	675	count = -1
	676	for count, (from_element, to_element) in enumerate(zip(from_segments,
	677	to_segments)):
	678	if from_element != to_element:
	679	break
	680	else:
	681	count += 1
	682	unique_from = from_segments[count:]
	683	unique_to = to_segments[count:]
	684	segments = (['..'] * len(unique_from) + unique_to)
	685	if len(segments) == 0:
	686	return '.'
	687	return osutils.pathjoin(*segments)
3873.3.1 by Martin Pool Move Transport._split_url to urlutils, and ad a simple test	688
	689
	690
	691	def parse_url(url):
	692	"""Extract the server address, the credentials and the path from the url.
	693
	694	user, password, host and path should be quoted if they contain reserved
	695	chars.
	696
	697	:param url: an quoted url
	698
	699	:return: (scheme, user, password, host, port, path) tuple, all fields
	700	are unquoted.
	701	"""
	702	if isinstance(url, unicode):
	703	raise errors.InvalidURL('should be ascii:\n%r' % url)
	704	url = url.encode('utf-8')
	705	(scheme, netloc, path, params,
	706	query, fragment) = urlparse.urlparse(url, allow_fragments=False)
	707	user = password = host = port = None
	708	if '@' in netloc:
	709	user, host = netloc.rsplit('@', 1)
	710	if ':' in user:
	711	user, password = user.split(':', 1)
	712	password = urllib.unquote(password)
	713	user = urllib.unquote(user)
	714	else:
	715	host = netloc
	716
4253.4.2 by Jelmer Vernooij Still parse port in case of ipv6.	717	if ':' in host and not (host[0] == '[' and host[-1] == ']'): #there is port
	718	host, port = host.rsplit(':',1)
	719	try:
	720	port = int(port)
	721	except ValueError:
	722	raise errors.InvalidURL('invalid port number %s in url:\n%s' %
	723	(port, url))
4253.4.3 by Jelmer Vernooij Support empty host name.	724	if host != "" and host[0] == '[' and host[-1] == ']': #IPv6
4253.4.2 by Jelmer Vernooij Still parse port in case of ipv6.	725	host = host[1:-1]
3873.3.2 by Martin Pool Accept ipv6 literals in URLs	726
3873.3.1 by Martin Pool Move Transport._split_url to urlutils, and ad a simple test	727	host = urllib.unquote(host)
	728	path = urllib.unquote(path)
	729
	730	return (scheme, user, password, host, port, path)