~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/xml5.py

Committer: John Arbash Meinel
Date: 2006-08-16 21:19:42 UTC
mto: This revision was merged to the branch mainline in revision 1942.
Revision ID: john@arbash-meinel.com-20060816211942-00cff30d95d6d1c2

rewrite escaper to use xml numerical entities, rather than using encode('utf8')

files modified:
bzrlib/benchmarks/bench_xml.py

bzrlib/xml5.py

Show diffs side-by-side

added added

removed removed

bzrlib/xml5.py

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import cStringIO

import re

from bzrlib import (

cache_utf8,

from bzrlib.errors import BzrError

# Cache of already-escaped text, keyed by the original string; filled in by
# _encode_and_escape().
_unicode_to_escaped_map = {}

# Compiled lazily by _ensure_utf8_re(); None until the first call.
_utf8_re = None

# Replacement text for each XML metacharacter.  The values must be the
# predefined XML entity references: mapping a character to itself would leave
# attribute values unescaped and produce malformed XML.
_utf8_escape_map = {
    "&": '&amp;',
    "'": "&apos;", # FIXME: overkill
    "\"": "&quot;",
    "<": "&lt;",
    ">": "&gt;",
    }

def _ensure_utf8_re():
    """Compile the module-level escaping regex the first time it is needed.

    Once ``_utf8_re`` holds a compiled pattern, further calls do nothing.
    """
    global _utf8_re
    if _utf8_re is None:
        # Matches a single character: one of the five XML metacharacters, or
        # any character in the range U+0080 through U+FFFF.
        _utf8_re = re.compile(u'[&<>\'\"\u0080-\uffff]')

def _utf8_escape_replace(match, _map=_utf8_escape_map):
    """Return the escaped form of the character matched by the regex.

    Standard XML metacharacters (<>"' etc.) are replaced using ``_map``.
    Any other matched character -- i.e. a non-ascii one -- becomes a decimal
    numeric character reference, because ElementTree did the same.  This
    keeps us compatible with older versions of bzr; the policy may change
    in the future.

    :param match: a regex match object whose group() is a single character.
    :param _map: mapping from XML metacharacter to its replacement text.
    :return: the replacement string for the matched character.
    """
    # TODO: jam 20060816 Benchmark this: is _map.get() plus a None check
    #       better than try/except KeyError?  It might be better still to
    #       pre-generate all possible conversions.  However, the occurrence
    #       of unicode characters is quite low, so a lookup with a computed
    #       fallback is the initial guess at the most efficient method.
    #       Also benchmark a regex that matches runs of characters against
    #       matching a single character and calling this function once per
    #       character.  The chance that we actually need multiple escapes
    #       is probably very low for our expected usage.
    matched_char = match.group()
    replacement = _map.get(matched_char)
    if replacement is None:
        # Not an XML metacharacter, so emit a numeric character reference.
        replacement = "&#%d;" % ord(matched_char)
    return replacement

def _encode_and_escape(unicode_str, _map=_unicode_to_escaped_map):
    """Escape XML-unsafe and non-ascii characters, caching the result.

    Characters matched by ``_utf8_re`` are replaced via
    ``_utf8_escape_replace`` and the result is converted with ``str()``.
    Each distinct input is escaped only once; later calls return the cached
    text from ``_map``.

    :param unicode_str: the string to escape.
    :param _map: cache mapping input strings to their escaped form.
    :return: the escaped text.
    """
    try:
        return _map[unicode_str]
    except KeyError:
        pass
    # The regex is compiled lazily.  Make sure it exists so that calling this
    # function before _ensure_utf8_re() does not fail on ``None.sub``.
    if _utf8_re is None:
        _ensure_utf8_re()
    # The alternative policy is to do a regular UTF8 encoding
    # and then escape only XML meta characters. This could take
    # advantage of cache_utf8 since a lot of the revision ids
    # and file ids would already be cached.
    text = str(_utf8_re.sub(_utf8_escape_replace, unicode_str))
    _map[unicode_str] = text
    return text

class Serializer_v5(Serializer):

"""Version 5 serializer

Packs objects into XML and vice versa.

"""

__slots__ = ['_utf8_re']

__slots__ = []

def __init__(self):

self._utf8_re = None

def write_inventory_to_string(self, inv):

"""Just call write_inventory with a StringIO and return the value"""

sio = cStringIO.StringIO()

100

self.write_inventory(inv, sio)

101

return sio.getvalue()

106

:param inv: the inventory to write.

107

:param f: the file to write.

108

"""

109

_ensure_utf8_re()

110

output = []

111

self._append_inventory_root(output, inv)

112

entries = inv.iter_entries()

113

# Skip the root

114

root_path, root_ie = entries.next()

115

for path, ie in entries:

116

self._append_entry(output, ie)

f.write(''.join(output))

# elt = self._pack_inventory(inv)

# for child in elt.getchildren():

# if isinstance(child, inventory.InventoryDirectory):

# print "foo\nbar\n"

# print child

# ElementTree(child).write(f, 'utf-8')

f.write('</inventory>\n')

117

output.append('</inventory>\n')

118

f.writelines(output)

119

120

def _append_inventory_root(self, output, inv):

121

"""Append the inventory root to output."""

110

164

111

165

def _append_utf8_escaped(self, output, a_string):

112

166

"""Append a_string to output as utf8."""

113

if self._utf8_re is None:

114

import re

115

self._utf8_re = re.compile("[&'\"<>]")

116

# escape attribute value

117

text = a_string.encode('utf8')

118

output.append(self._utf8_re.sub(self._utf8_escape_replace, text))

167

#output.append(_encode_and_escape(a_string))

168

text = str(_utf8_re.sub(_utf8_escape_replace, a_string))

169

output.append(text)

119

170

output.append('"')

120

171

121

_utf8_escape_map = {

122

"&":'&',

123

"'":"'", # FIXME: overkill

124

"\"":""",

125

"<":"<",

126

">":">",

127

}

128

def _utf8_escape_replace(self, match, map=_utf8_escape_map):

129

return map[match.group()]

130

131

172

def _pack_inventory(self, inv):

132

173

"""Convert to XML Element"""

133

174

entries = inv.iter_entries()

Older »