40
42
def _ensure_utf8_re():
41
"""Make sure the _utf8_re regex has been compiled"""
43
if _utf8_re is not None:
45
_utf8_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
48
def _utf8_escape_replace(match, _map=_utf8_escape_map):
43
"""Make sure the _utf8_re and _unicode_re regexes have been compiled."""
44
global _utf8_re, _unicode_re
46
_utf8_re = re.compile('[&<>\'\"]|[\x80-\xff]+')
47
if _unicode_re is None:
48
_unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
51
def _unicode_escape_replace(match, _map=_xml_escape_map):
49
52
"""Replace a string of non-ascii, non XML safe characters with their escape
51
54
This will escape both Standard XML escapes, like <>"', etc.
64
67
return "&#%d;" % ord(match.group())
67
_unicode_to_escaped_map = {}
69
def _encode_and_escape(unicode_str, _map=_unicode_to_escaped_map):
70
def _utf8_escape_replace(match, _map=_xml_escape_map):
71
"""Escape utf8 characters into XML safe ones.
73
This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
74
or it is handling characters with the high-bit set. For ascii characters,
75
we just lookup the replacement in the dictionary. For everything else, we
76
decode back into Unicode, and then use the XML escape code.
79
return _map[match.group()]
81
return ''.join('&#%d;' % ord(uni_chr)
82
for uni_chr in match.group().decode('utf8'))
87
def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
70
88
"""Encode the string into utf8, and escape invalid XML characters"""
71
89
# We frequently get entities we have not seen before, so it is better
72
90
# to check if None, rather than try/KeyError
73
text = _map.get(unicode_str)
91
text = _map.get(unicode_or_utf8_str)
75
# The alternative policy is to do a regular UTF8 encoding
76
# and then escape only XML meta characters.
77
# Performance is equivalent once you use cache_utf8. *However*
78
# this makes the serialized texts incompatible with old versions
79
# of bzr. So no net gain. (Perhaps the read code would handle utf8
80
# better than entity escapes, but cElementTree seems to do just fine
82
text = str(_utf8_re.sub(_utf8_escape_replace, unicode_str)) + '"'
83
_map[unicode_str] = text
93
if unicode_or_utf8_str.__class__ == unicode:
94
# The alternative policy is to do a regular UTF8 encoding
95
# and then escape only XML meta characters.
96
# Performance is equivalent once you use cache_utf8. *However*
97
# this makes the serialized texts incompatible with old versions
98
# of bzr. So no net gain. (Perhaps the read code would handle utf8
99
# better than entity escapes, but cElementTree seems to do just fine
101
text = str(_unicode_re.sub(_unicode_escape_replace,
102
unicode_or_utf8_str)) + '"'
104
# Plain strings are considered to already be in utf-8 so we do a
105
# slightly different method for escaping.
106
text = _utf8_re.sub(_utf8_escape_replace,
107
unicode_or_utf8_str) + '"'
108
_map[unicode_or_utf8_str] = text
112
def _get_utf8_or_ascii(a_str,
113
_encode_utf8=cache_utf8.encode,
114
_get_cached_ascii=cache_utf8.get_cached_ascii):
115
"""Return a cached version of the string.
117
cElementTree will return a plain string if the XML is plain ascii. It only
118
returns Unicode when it needs to. We want to work in utf-8 strings. So if
119
cElementTree returns a plain string, we can just return the cached version.
120
If it is Unicode, then we need to encode it.
122
:param a_str: An 8-bit string or Unicode as returned by
123
cElementTree.Element.get()
124
:return: A utf-8 encoded 8-bit string.
126
# This is fairly optimized because we know what cElementTree does, this is
127
# not meant as a generic function for all cases. Because it is possible for
128
# an 8-bit string to not be ascii or valid utf8.
129
if a_str.__class__ == unicode:
130
return _encode_utf8(a_str)
132
return _get_cached_ascii(a_str)
87
135
def _clear_cache():
88
136
"""Clean out the unicode => escaped map"""
89
_unicode_to_escaped_map.clear()
137
_to_escaped_map.clear()
92
140
class Serializer_v5(Serializer):
179
232
def _pack_revision(self, rev):
180
233
"""Revision object -> xml tree"""
234
# For the XML format, we need to write them as Unicode rather than as
235
# utf-8 strings. So that cElementTree can handle properly escaping
237
decode_utf8 = cache_utf8.decode
238
revision_id = rev.revision_id
239
if isinstance(revision_id, str):
240
revision_id = decode_utf8(revision_id)
181
241
root = Element('revision',
182
242
committer = rev.committer,
183
243
timestamp = '%.3f' % rev.timestamp,
184
revision_id = rev.revision_id,
244
revision_id = revision_id,
185
245
inventory_sha1 = rev.inventory_sha1,
227
291
revision_id = elt.get('revision_id')
228
292
if revision_id is not None:
229
revision_id = cache_utf8.get_cached_unicode(revision_id)
293
revision_id = cache_utf8.encode(revision_id)
230
294
inv = Inventory(root_id, revision_id=revision_id)
232
296
ie = self._unpack_entry(e)
233
if ie.parent_id == ROOT_ID:
297
if ie.parent_id is None:
234
298
ie.parent_id = root_id
238
def _unpack_entry(self, elt, none_parents=False):
302
def _unpack_entry(self, elt):
240
304
if not InventoryEntry.versionable_kind(kind):
241
305
raise AssertionError('unsupported entry kind %s' % kind)
243
get_cached = cache_utf8.get_cached_unicode
307
get_cached = _get_utf8_or_ascii
245
309
parent_id = elt.get('parent_id')
246
if parent_id is None and not none_parents:
248
# TODO: jam 20060817 At present, caching file ids costs us too
249
# much time. It slows down overall read performances from
250
# approx 500ms to 700ms. And doesn't improve future reads.
251
# it might be because revision ids and file ids are mixing.
252
# Consider caching *just* the file ids, for a limited period
254
#parent_id = get_cached(parent_id)
255
#file_id = get_cached(elt.get('file_id'))
256
file_id = elt.get('file_id')
310
if parent_id is not None:
311
parent_id = get_cached(parent_id)
312
file_id = get_cached(elt.get('file_id'))
258
314
if kind == 'directory':
259
315
ie = inventory.InventoryDirectory(file_id,