43
40
def _ensure_utf8_re():
44
"""Make sure the _utf8_re and _unicode_re regexes have been compiled."""
45
global _utf8_re, _unicode_re
47
_utf8_re = re.compile('[&<>\'\"]|[\x80-\xff]+')
48
if _unicode_re is None:
49
_unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
52
def _unicode_escape_replace(match, _map=_xml_escape_map):
41
"""Make sure the _utf8_re regex has been compiled"""
43
if _utf8_re is not None:
45
_utf8_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
48
def _utf8_escape_replace(match, _map=_utf8_escape_map):
53
49
"""Replace a string of non-ascii, non XML safe characters with their escape
55
51
This will escape both Standard XML escapes, like <>"', etc.
68
64
return "&#%d;" % ord(match.group())
71
def _utf8_escape_replace(match, _map=_xml_escape_map):
72
"""Escape utf8 characters into XML safe ones.
74
This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
75
or it is handling characters with the high-bit set. For ascii characters,
76
we just lookup the replacement in the dictionary. For everything else, we
77
decode back into Unicode, and then use the XML escape code.
80
return _map[match.group()]
82
return ''.join('&#%d;' % ord(uni_chr)
83
for uni_chr in match.group().decode('utf8'))
88
def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
67
_unicode_to_escaped_map = {}
69
def _encode_and_escape(unicode_str, _map=_unicode_to_escaped_map):
89
70
"""Encode the string into utf8, and escape invalid XML characters"""
90
71
# We frequently get entities we have not seen before, so it is better
91
72
# to check if None, rather than try/KeyError
92
text = _map.get(unicode_or_utf8_str)
73
text = _map.get(unicode_str)
94
if unicode_or_utf8_str.__class__ == unicode:
95
# The alternative policy is to do a regular UTF8 encoding
96
# and then escape only XML meta characters.
97
# Performance is equivalent once you use cache_utf8. *However*
98
# this makes the serialized texts incompatible with old versions
99
# of bzr. So no net gain. (Perhaps the read code would handle utf8
100
# better than entity escapes, but cElementTree seems to do just fine
102
text = str(_unicode_re.sub(_unicode_escape_replace,
103
unicode_or_utf8_str)) + '"'
105
# Plain strings are considered to already be in utf-8 so we do a
106
# slightly different method for escaping.
107
text = _utf8_re.sub(_utf8_escape_replace,
108
unicode_or_utf8_str) + '"'
109
_map[unicode_or_utf8_str] = text
75
# The alternative policy is to do a regular UTF8 encoding
76
# and then escape only XML meta characters.
77
# Performance is equivalent once you use cache_utf8. *However*
78
# this makes the serialized texts incompatible with old versions
79
# of bzr. So no net gain. (Perhaps the read code would handle utf8
80
# better than entity escapes, but cElementTree seems to do just fine
82
text = str(_utf8_re.sub(_utf8_escape_replace, unicode_str)) + '"'
83
_map[unicode_str] = text
113
def _get_utf8_or_ascii(a_str,
114
_encode_utf8=cache_utf8.encode,
115
_get_cached_ascii=cache_utf8.get_cached_ascii):
116
"""Return a cached version of the string.
118
cElementTree will return a plain string if the XML is plain ascii. It only
119
returns Unicode when it needs to. We want to work in utf-8 strings. So if
120
cElementTree returns a plain string, we can just return the cached version.
121
If it is Unicode, then we need to encode it.
123
:param a_str: An 8-bit string or Unicode as returned by
124
cElementTree.Element.get()
125
:return: A utf-8 encoded 8-bit string.
127
# This is fairly optimized because we know what cElementTree does, this is
128
# not meant as a generic function for all cases. Because it is possible for
129
# an 8-bit string to not be ascii or valid utf8.
130
if a_str.__class__ == unicode:
131
return _encode_utf8(a_str)
133
return _get_cached_ascii(a_str)
136
87
def _clear_cache():
137
88
"""Clean out the unicode => escaped map"""
138
_to_escaped_map.clear()
89
_unicode_to_escaped_map.clear()
141
92
class Serializer_v5(Serializer):
234
179
def _pack_revision(self, rev):
235
180
"""Revision object -> xml tree"""
236
# For the XML format, we need to write them as Unicode rather than as
237
# utf-8 strings. So that cElementTree can handle properly escaping
239
decode_utf8 = cache_utf8.decode
240
revision_id = rev.revision_id
241
if isinstance(revision_id, str):
242
revision_id = decode_utf8(revision_id)
243
181
root = Element('revision',
244
182
committer = rev.committer,
245
timestamp = '%.3f' % rev.timestamp,
246
revision_id = revision_id,
183
timestamp = '%.9f' % rev.timestamp,
184
revision_id = rev.revision_id,
247
185
inventory_sha1 = rev.inventory_sha1,
258
196
pelts.tail = pelts.text = '\n'
259
197
for parent_id in rev.parent_ids:
260
198
assert isinstance(parent_id, basestring)
261
_mod_revision.check_not_reserved_id(parent_id)
262
199
p = SubElement(pelts, 'revision_ref')
264
if isinstance(parent_id, str):
265
parent_id = decode_utf8(parent_id)
266
201
p.set('revision_id', parent_id)
267
202
if rev.properties:
268
203
self._pack_revision_properties(rev, root)
294
227
revision_id = elt.get('revision_id')
295
228
if revision_id is not None:
296
revision_id = cache_utf8.encode(revision_id)
229
revision_id = cache_utf8.get_cached_unicode(revision_id)
297
230
inv = Inventory(root_id, revision_id=revision_id)
299
232
ie = self._unpack_entry(e)
300
if ie.parent_id is None:
233
if ie.parent_id == ROOT_ID:
301
234
ie.parent_id = root_id
305
def _unpack_entry(self, elt):
238
def _unpack_entry(self, elt, none_parents=False):
307
240
if not InventoryEntry.versionable_kind(kind):
308
241
raise AssertionError('unsupported entry kind %s' % kind)
310
get_cached = _get_utf8_or_ascii
243
get_cached = cache_utf8.get_cached_unicode
312
245
parent_id = elt.get('parent_id')
313
if parent_id is not None:
314
parent_id = get_cached(parent_id)
315
file_id = get_cached(elt.get('file_id'))
246
if parent_id is None and not none_parents:
248
# TODO: jam 20060817 At present, caching file ids costs us too
249
# much time. It slows down overall read performances from
250
# approx 500ms to 700ms. And doesn't improve future reads.
251
# it might be because revision ids and file ids are mixing.
252
# Consider caching *just* the file ids, for a limited period
254
#parent_id = get_cached(parent_id)
255
#file_id = get_cached(elt.get('file_id'))
256
file_id = elt.get('file_id')
317
258
if kind == 'directory':
318
259
ie = inventory.InventoryDirectory(file_id,