292
292
# Splits a URL into a 'scheme' group (two or more characters, none of which
# is ':' or '/') and a 'path' group holding everything after the '://'.
# NOTE(review): the two-character minimum presumably keeps a single drive
# letter from being read as a scheme -- confirm.
_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,})://(?P<path>.*)$')
293
# Matches one percent-escape: '%' followed by exactly two hex digits, in
# either case. The whole escape is captured as group 1.
_url_hex_escapes_re = re.compile(r'(%[0-9a-fA-F]{2})')
296
def _unescape_safe_chars(matchobj):
297
"""re.sub callback to convert hex-escapes to plain characters (if safe).
299
e.g. '%7E' will be converted to '~'.
301
hex_digits = matchobj.group(0)[1:]
302
char = chr(int(hex_digits, 16))
303
if char in _url_dont_escape_characters:
306
return matchobj.group(0).upper()
295
309
def normalize_url(url):
296
310
"""Make sure that a path string is in fully normalized URL form.
298
This handles URLs which have unicode characters, spaces,
312
This handles URLs which have unicode characters, spaces,
299
313
special characters, etc.
301
315
It has two basic modes of operation, depending on whether the
314
328
m = _url_scheme_re.match(url)
316
330
return local_path_to_url(url)
331
scheme = m.group('scheme')
332
path = m.group('path')
317
333
if not isinstance(url, unicode):
319
335
if c not in _url_safe_characters:
320
336
raise errors.InvalidURL(url, 'URLs can only contain specific'
321
337
' safe characters (not %r)' % c)
338
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
339
return str(scheme + '://' + ''.join(path))
323
341
# We have a unicode (hybrid) url
324
scheme = m.group('scheme')
325
path = list(m.group('path'))
342
path_chars = list(path)
327
for i in xrange(len(path)):
328
if path[i] not in _url_safe_characters:
329
chars = path[i].encode('utf-8')
330
path[i] = ''.join(['%%%02X' % ord(c) for c in path[i].encode('utf-8')])
331
return str(scheme + '://' + ''.join(path))
344
for i in xrange(len(path_chars)):
345
if path_chars[i] not in _url_safe_characters:
346
chars = path_chars[i].encode('utf-8')
347
path_chars[i] = ''.join(
348
['%%%02X' % ord(c) for c in path_chars[i].encode('utf-8')])
349
path = ''.join(path_chars)
350
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
351
return str(scheme + '://' + path)
334
354
def relative_url(base, other):
514
534
# These entries map to their own escaped form ('%' + hex), i.e. they stay encoded
515
535
_hex_display_map.update((hex,'%'+hex) for hex in _no_decode_hex)
537
# These characters shouldn't be percent-encoded, and it's always safe to
538
# decode them if they are.
539
_url_dont_escape_characters = set(
540
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
541
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
542
"0123456789" # Numbers
543
"-._~" # Unreserved characters
517
546
# These characters should not be escaped
518
547
_url_safe_characters = set(
519
548
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha