42
40
def _ensure_utf8_re():
43
"""Make sure the _utf8_re and _unicode_re regexes have been compiled."""
44
global _utf8_re, _unicode_re
46
_utf8_re = re.compile('[&<>\'\"]|[\x80-\xff]+')
47
if _unicode_re is None:
48
_unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
51
def _unicode_escape_replace(match, _map=_xml_escape_map):
41
"""Make sure the _utf8_re regex has been compiled"""
43
if _utf8_re is not None:
45
_utf8_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
48
def _utf8_escape_replace(match, _map=_utf8_escape_map):
52
49
"""Replace a string of non-ascii, non XML safe characters with their escape
54
51
This will escape both Standard XML escapes, like <>"', etc.
67
64
return "&#%d;" % ord(match.group())
70
def _utf8_escape_replace(match, _map=_xml_escape_map):
71
"""Escape utf8 characters into XML safe ones.
73
This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
74
or it is handling characters with the high-bit set. For ascii characters,
75
we just lookup the replacement in the dictionary. For everything else, we
76
decode back into Unicode, and then use the XML escape code.
79
return _map[match.group()]
81
return ''.join('&#%d;' % ord(uni_chr)
82
for uni_chr in match.group().decode('utf8'))
87
def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
67
_unicode_to_escaped_map = {}
69
def _encode_and_escape(unicode_str, _map=_unicode_to_escaped_map):
88
70
"""Encode the string into utf8, and escape invalid XML characters"""
89
71
# We frequently get entities we have not seen before, so it is better
90
72
# to check if None, rather than try/KeyError
91
text = _map.get(unicode_or_utf8_str)
73
text = _map.get(unicode_str)
93
if unicode_or_utf8_str.__class__ == unicode:
94
# The alternative policy is to do a regular UTF8 encoding
95
# and then escape only XML meta characters.
96
# Performance is equivalent once you use cache_utf8. *However*
97
# this makes the serialized texts incompatible with old versions
98
# of bzr. So no net gain. (Perhaps the read code would handle utf8
99
# better than entity escapes, but cElementTree seems to do just fine
101
text = str(_unicode_re.sub(_unicode_escape_replace,
102
unicode_or_utf8_str)) + '"'
104
# Plain strings are considered to already be in utf-8 so we do a
105
# slightly different method for escaping.
106
text = _utf8_re.sub(_utf8_escape_replace,
107
unicode_or_utf8_str) + '"'
108
_map[unicode_or_utf8_str] = text
75
# The alternative policy is to do a regular UTF8 encoding
76
# and then escape only XML meta characters.
77
# Performance is equivalent once you use cache_utf8. *However*
78
# this makes the serialized texts incompatible with old versions
79
# of bzr. So no net gain. (Perhaps the read code would handle utf8
80
# better than entity escapes, but cElementTree seems to do just fine
82
text = str(_utf8_re.sub(_utf8_escape_replace, unicode_str)) + '"'
83
_map[unicode_str] = text
112
def _get_utf8_or_ascii(a_str,
113
_encode_utf8=cache_utf8.encode,
114
_get_cached_ascii=cache_utf8.get_cached_ascii):
115
"""Return a cached version of the string.
117
cElementTree will return a plain string if the XML is plain ascii. It only
118
returns Unicode when it needs to. We want to work in utf-8 strings. So if
119
cElementTree returns a plain string, we can just return the cached version.
120
If it is Unicode, then we need to encode it.
122
:param a_str: An 8-bit string or Unicode as returned by
123
cElementTree.Element.get()
124
:return: A utf-8 encoded 8-bit string.
126
# This is fairly optimized because we know what cElementTree does, this is
127
# not meant as a generic function for all cases. Because it is possible for
128
# an 8-bit string to not be ascii or valid utf8.
129
if a_str.__class__ == unicode:
130
return _encode_utf8(a_str)
132
return _get_cached_ascii(a_str)
135
87
def _clear_cache():
136
88
"""Clean out the unicode => escaped map"""
137
_to_escaped_map.clear()
89
_unicode_to_escaped_map.clear()
140
92
class Serializer_v5(Serializer):
232
179
def _pack_revision(self, rev):
233
180
"""Revision object -> xml tree"""
234
# For the XML format, we need to write them as Unicode rather than as
235
# utf-8 strings. So that cElementTree can handle properly escaping
237
decode_utf8 = cache_utf8.decode
238
revision_id = rev.revision_id
239
if isinstance(revision_id, str):
240
revision_id = decode_utf8(revision_id)
241
181
root = Element('revision',
242
182
committer = rev.committer,
243
timestamp = '%.3f' % rev.timestamp,
244
revision_id = revision_id,
183
timestamp = '%.9f' % rev.timestamp,
184
revision_id = rev.revision_id,
245
185
inventory_sha1 = rev.inventory_sha1,
291
227
revision_id = elt.get('revision_id')
292
228
if revision_id is not None:
293
revision_id = cache_utf8.encode(revision_id)
229
revision_id = cache_utf8.get_cached_unicode(revision_id)
294
230
inv = Inventory(root_id, revision_id=revision_id)
296
232
ie = self._unpack_entry(e)
297
if ie.parent_id is None:
233
if ie.parent_id == ROOT_ID:
298
234
ie.parent_id = root_id
302
def _unpack_entry(self, elt):
238
def _unpack_entry(self, elt, none_parents=False):
304
240
if not InventoryEntry.versionable_kind(kind):
305
241
raise AssertionError('unsupported entry kind %s' % kind)
307
get_cached = _get_utf8_or_ascii
243
get_cached = cache_utf8.get_cached_unicode
309
245
parent_id = elt.get('parent_id')
310
if parent_id is not None:
311
parent_id = get_cached(parent_id)
312
file_id = get_cached(elt.get('file_id'))
246
if parent_id is None and not none_parents:
248
# TODO: jam 20060817 At present, caching file ids costs us too
249
# much time. It slows down overall read performances from
250
# approx 500ms to 700ms. And doesn't improve future reads.
251
# it might be because revision ids and file ids are mixing.
252
# Consider caching *just* the file ids, for a limited period
254
#parent_id = get_cached(parent_id)
255
#file_id = get_cached(elt.get('file_id'))
256
file_id = elt.get('file_id')
314
258
if kind == 'directory':
315
259
ie = inventory.InventoryDirectory(file_id,