40
41
def _ensure_utf8_re():
41
"""Make sure the _utf8_re regex has been compiled"""
43
if _utf8_re is not None:
45
_utf8_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
48
def _utf8_escape_replace(match, _map=_utf8_escape_map):
42
"""Make sure the _utf8_re and _unicode_re regexes have been compiled."""
43
global _utf8_re, _unicode_re
45
_utf8_re = re.compile('[&<>\'\"]|[\x80-\xff]+')
46
if _unicode_re is None:
47
_unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
50
def _unicode_escape_replace(match, _map=_xml_escape_map):
49
51
"""Replace a string of non-ascii, non XML safe characters with their escape
51
53
This will escape both Standard XML escapes, like <>"', etc.
64
66
return "&#%d;" % ord(match.group())
67
_unicode_to_escaped_map = {}
69
def _encode_and_escape(unicode_str, _map=_unicode_to_escaped_map):
69
def _utf8_escape_replace(match, _map=_xml_escape_map):
70
"""Escape utf8 characters into XML safe ones.
72
This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
73
or it is handling characters with the high-bit set. For ascii characters,
74
we just lookup the replacement in the dictionary. For everything else, we
75
decode back into Unicode, and then use the XML escape code.
78
return _map[match.group()]
80
return ''.join('&#%d;' % ord(uni_chr)
81
for uni_chr in match.group().decode('utf8'))
86
def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
70
87
"""Encode the string into utf8, and escape invalid XML characters"""
71
88
# We frequently get entities we have not seen before, so it is better
72
89
# to check if None, rather than try/KeyError
73
text = _map.get(unicode_str)
90
text = _map.get(unicode_or_utf8_str)
75
# The alternative policy is to do a regular UTF8 encoding
76
# and then escape only XML meta characters.
77
# Performance is equivalent once you use cache_utf8. *However*
78
# this makes the serialized texts incompatible with old versions
79
# of bzr. So no net gain. (Perhaps the read code would handle utf8
80
# better than entity escapes, but cElementTree seems to do just fine
82
text = str(_utf8_re.sub(_utf8_escape_replace, unicode_str)) + '"'
83
_map[unicode_str] = text
92
if unicode_or_utf8_str.__class__ == unicode:
93
# The alternative policy is to do a regular UTF8 encoding
94
# and then escape only XML meta characters.
95
# Performance is equivalent once you use cache_utf8. *However*
96
# this makes the serialized texts incompatible with old versions
97
# of bzr. So no net gain. (Perhaps the read code would handle utf8
98
# better than entity escapes, but cElementTree seems to do just fine
100
text = str(_unicode_re.sub(_unicode_escape_replace,
101
unicode_or_utf8_str)) + '"'
103
# Plain strings are considered to already be in utf-8 so we do a
104
# slightly different method for escaping.
105
text = _utf8_re.sub(_utf8_escape_replace,
106
unicode_or_utf8_str) + '"'
107
_map[unicode_or_utf8_str] = text