~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/xml5.py

Committer: John Arbash Meinel
Date: 2007-02-09 23:15:53 UTC
mto: This revision was merged to the branch mainline in revision 2294.
Revision ID: john@arbash-meinel.com-20070209231553-9ywoxs2t3dsx667s

Make sure xml5 can handle unicode or utf8 strings

files modified:
bzrlib/tests/test_xml.py

bzrlib/xml5.py

Show diffs side-by-side

added

removed

bzrlib/xml5.py

_utf8_re = None

_utf8_escape_map = {

_unicode_re = None

_xml_escape_map = {

"&":'&amp;',

"'":"&apos;", # FIXME: overkill

"\"":"&quot;",

def _ensure_utf8_re():

"""Make sure the _utf8_re regex has been compiled"""

global _utf8_re

if _utf8_re is not None:

return

_utf8_re = re.compile(u'[&<>\'\"\u0080-\uffff]')

def _utf8_escape_replace(match, _map=_utf8_escape_map):

"""Make sure the _utf8_re and _unicode_re regexes have been compiled."""

global _utf8_re, _unicode_re

if _utf8_re is None:

_utf8_re = re.compile('[&<>\'\"]|[\x80-\xff]+')

if _unicode_re is None:

_unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')

def _unicode_escape_replace(match, _map=_xml_escape_map):

"""Replace a string of non-ascii, non XML safe characters with their escape

This will escape both Standard XML escapes, like <>"', etc.

return "&#%d;" % ord(match.group())

_unicode_to_escaped_map = {}

def _encode_and_escape(unicode_str, _map=_unicode_to_escaped_map):

def _utf8_escape_replace(match, _map=_xml_escape_map):

"""Escape utf8 characters into XML safe ones.

This uses 2 tricks. It is either escaping "standard" characters, like "&<>,

or it is handling characters with the high-bit set. For ascii characters,

we just lookup the replacement in the dictionary. For everything else, we

decode back into Unicode, and then use the XML escape code.

"""

try:

return _map[match.group()]

except KeyError:

return ''.join('&#%d;' % ord(uni_chr)

for uni_chr in match.group().decode('utf8'))

_to_escaped_map = {}

def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):

"""Encode the string into utf8, and escape invalid XML characters"""

# We frequently get entities we have not seen before, so it is better

# to check if None, rather than try/KeyError

text = _map.get(unicode_str)

text = _map.get(unicode_or_utf8_str)

if text is None:

# The alternative policy is to do a regular UTF8 encoding

# and then escape only XML meta characters.

# Performance is equivalent once you use cache_utf8. *However*

# this makes the serialized texts incompatible with old versions

# of bzr. So no net gain. (Perhaps the read code would handle utf8

# better than entity escapes, but cElementTree seems to do just fine

# either way)

text = str(_utf8_re.sub(_utf8_escape_replace, unicode_str)) + '"'

_map[unicode_str] = text

if unicode_or_utf8_str.__class__ == unicode:

# The alternative policy is to do a regular UTF8 encoding

# and then escape only XML meta characters.

# Performance is equivalent once you use cache_utf8. *However*

# this makes the serialized texts incompatible with old versions

# of bzr. So no net gain. (Perhaps the read code would handle utf8

# better than entity escapes, but cElementTree seems to do just fine

# either way)

100

text = str(_unicode_re.sub(_unicode_escape_replace,

101

unicode_or_utf8_str)) + '"'

102

else:

103

# Plain strings are considered to already be in utf-8 so we do a

104

# slightly different method for escaping.

105

text = _utf8_re.sub(_utf8_escape_replace,

106

unicode_or_utf8_str) + '"'

107

_map[unicode_or_utf8_str] = text

108

return text

109

110

109

133

110

134

def _clear_cache():

111

135

"""Clean out the unicode => escaped map"""

112

_unicode_to_escaped_map.clear()

136

_to_escaped_map.clear()

113

137

114

138

115

139

class Serializer_v5(Serializer):

Older »