130
137
return re.subn(u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
131
138
lambda match: match.group(0).encode('unicode_escape'),
142
def get_utf8_or_ascii(a_str, _encode_utf8=cache_utf8.encode):
143
"""Return a cached version of the string.
145
cElementTree will return a plain string if the XML is plain ascii. It only
146
returns Unicode when it needs to. We want to work in utf-8 strings. So if
147
cElementTree returns a plain string, we can just return the cached version.
148
If it is Unicode, then we need to encode it.
150
:param a_str: An 8-bit string or Unicode as returned by
151
cElementTree.Element.get()
152
:return: A utf-8 encoded 8-bit string.
154
# This is fairly optimized because we know what cElementTree does, this is
155
# not meant as a generic function for all cases. Because it is possible for
156
# an 8-bit string to not be ascii or valid utf8.
157
if a_str.__class__ is unicode:
158
return _encode_utf8(a_str)
163
# Matches either one XML-special character (& < > ' ") or a run of one or
# more high-bit bytes (\x80-\xff). Applied to plain 8-bit strings, which are
# treated as already being utf-8.
_utf8_re = lazy_regex.lazy_compile('[&<>\'\"]|[\x80-\xff]+')
164
# Matches exactly one character that is either XML-special (& < > ' ") or in
# the range U+0080-U+FFFF. Applied to unicode strings.
_unicode_re = lazy_regex.lazy_compile(u'[&<>\'\"\u0080-\uffff]')
169
"'":"'", # FIXME: overkill
176
def _unicode_escape_replace(match, _map=_xml_escape_map):
    """Replace a non-ascii or non XML safe character with its escape.

    This will escape both Standard XML escapes, like <>"', etc.
    As well as escaping non ascii characters, because ElementTree did.
    This helps us remain compatible to older versions of bzr. We may change
    our policy in the future, though.

    :param match: A regex match object. Its ``group()`` must be a single
        character, because ``ord()`` is applied to it on the fallback path.
    :param _map: Mapping from a special character to its replacement text.
    :return: The entry from ``_map`` when there is one, otherwise a decimal
        numeric character reference (``&#NNN;``).
    """
    # jam 20060816 Benchmarks show that try/KeyError is faster if you
    # expect the entity to rarely miss. There is about a 10% difference
    # in overall time. But if you miss frequently, then if None is much
    # faster. For our use case, we *rarely* have a revision id, file id
    # or path name that is unicode. So use try/KeyError.
    char = match.group()
    try:
        return _map[char]
    except KeyError:
        # Not one of the standard XML specials: emit a numeric reference.
        return "&#%d;" % ord(char)
195
def _utf8_escape_replace(match, _map=_xml_escape_map):
    """Escape utf8 characters into XML safe ones.

    This uses 2 tricks. It is either escaping "standard" characters, like
    &<>"', or it is handling characters with the high-bit set. For ascii
    characters, we just lookup the replacement in the dictionary. For
    everything else, we decode back into Unicode, and then use the XML
    escape code.

    :param match: A regex match object. Its ``group()`` is either one
        character found in ``_map`` or a run of utf-8 encoded bytes.
    :param _map: Mapping from a special character to its replacement text.
    :return: The entry from ``_map`` when there is one, otherwise the
        concatenated decimal character references (``&#NNN;``) of every
        code point obtained by decoding the matched bytes as utf-8.
    """
    matched = match.group()
    try:
        return _map[matched]
    except KeyError:
        # A run of high-bit bytes: decode it and escape each code point.
        return ''.join('&#%d;' % ord(uni_chr)
                       for uni_chr in matched.decode('utf8'))
212
def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
    """Encode the string into utf8, and escape invalid XML characters.

    The escaped text is memoised in ``_map``, keyed by the input string.
    Note that the returned text ends with a double quote character: the
    value is built with a trailing ``'"'`` so that callers can splice it
    straight into an attribute whose opening quote they have written.

    :param unicode_or_utf8_str: A unicode string, or an 8-bit string that is
        considered to already be utf-8.
    :param _map: Cache mapping an input string to its escaped text.
    :return: The escaped text followed by a closing double quote.
    """
    # We frequently get entities we have not seen before, so it is better
    # to check if None, rather than try/KeyError
    text = _map.get(unicode_or_utf8_str)
    if text is None:
        if unicode_or_utf8_str.__class__ is unicode:
            # The alternative policy is to do a regular UTF8 encoding
            # and then escape only XML meta characters.
            # Performance is equivalent once you use cache_utf8. *However*
            # this makes the serialized texts incompatible with old versions
            # of bzr. So no net gain. (Perhaps the read code would handle utf8
            # better than entity escapes, but cElementTree seems to do just
            # fine either way.)
            text = str(_unicode_re.sub(_unicode_escape_replace,
                                       unicode_or_utf8_str)) + '"'
        else:
            # Plain strings are considered to already be in utf-8 so we do a
            # slightly different method for escaping.
            text = _utf8_re.sub(_utf8_escape_replace,
                                unicode_or_utf8_str) + '"'
        _map[unicode_or_utf8_str] = text
    return text
238
"""Clean out the unicode => escaped map"""
239
_to_escaped_map.clear()
242
def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
244
file_id = elt_get('file_id')
245
revision = elt_get('revision')
246
# Check and see if we have already unpacked this exact entry
247
# Some timings for "repo.revision_trees(last_100_revs)"
249
# unmodified 4.1s 40.8s
251
# using fifo 2.83s 29.1s
255
# no_copy 2.00s 20.5s
256
# no_c,dict 1.95s 18.0s
257
# Note that a cache of 10k nodes is more than sufficient to hold all of
258
# the inventory for the last 100 revs for bzr, but not for mysql (20k
259
# is enough for mysql, which saves the same 2s as using a dict)
261
# Breakdown of mysql using time.clock()
262
# 4.1s 2 calls to element.get for file_id, revision_id
263
# 4.5s cache_hit lookup
264
# 7.1s InventoryFile.copy()
265
# 2.4s InventoryDirectory.copy()
266
# 0.4s decoding unique entries
267
# 1.6s decoding entries after FIFO fills up
268
# 0.8s Adding nodes to FIFO (including flushes)
269
# 0.1s cache miss lookups
271
# 4.1s 2 calls to element.get for file_id, revision_id
272
# 9.9s cache_hit lookup
273
# 10.8s InventoryEntry.copy()
274
# 0.3s cache miss lookups
275
# 1.2s decoding entries
276
# 1.0s adding nodes to LRU
277
if entry_cache is not None and revision is not None:
278
key = (file_id, revision)
280
# We copy it, because some operations may mutate it
281
cached_ie = entry_cache[key]
285
# Only copying directory entries drops us 2.85s => 2.35s
286
if return_from_cache:
287
if cached_ie.kind == 'directory':
288
return cached_ie.copy()
290
return cached_ie.copy()
293
if not inventory.InventoryEntry.versionable_kind(kind):
294
raise AssertionError('unsupported entry kind %s' % kind)
296
file_id = get_utf8_or_ascii(file_id)
297
if revision is not None:
298
revision = get_utf8_or_ascii(revision)
299
parent_id = elt_get('parent_id')
300
if parent_id is not None:
301
parent_id = get_utf8_or_ascii(parent_id)
303
if kind == 'directory':
304
ie = inventory.InventoryDirectory(file_id,
308
ie = inventory.InventoryFile(file_id,
311
ie.text_sha1 = elt_get('text_sha1')
312
if elt_get('executable') == 'yes':
314
v = elt_get('text_size')
315
ie.text_size = v and int(v)
316
elif kind == 'symlink':
317
ie = inventory.InventoryLink(file_id,
320
ie.symlink_target = elt_get('symlink_target')
322
raise errors.UnsupportedInventoryKind(kind)
323
ie.revision = revision
324
if revision is not None and entry_cache is not None:
325
# We cache a copy() because callers like to mutate objects, and
326
# that would cause the item in cache to mutate as well.
327
# This has a small effect on many-inventory performance, because
328
# the majority fraction is spent in cache hits, not misses.
329
entry_cache[key] = ie.copy()
334
def unpack_inventory_flat(elt, format_num, unpack_entry,
335
entry_cache=None, return_from_cache=False):
336
"""Unpack a flat XML inventory.
338
:param elt: XML element for the inventory
339
:param format_num: Expected format number
340
:param unpack_entry: Function for unpacking inventory entries
341
:return: An inventory
342
:raise UnexpectedInventoryFormat: When unexpected elements or data is
345
if elt.tag != 'inventory':
346
raise errors.UnexpectedInventoryFormat('Root tag is %r' % elt.tag)
347
format = elt.get('format')
348
if format != format_num:
349
raise errors.UnexpectedInventoryFormat('Invalid format version %r'
351
revision_id = elt.get('revision_id')
352
if revision_id is not None:
353
revision_id = cache_utf8.encode(revision_id)
354
inv = inventory.Inventory(root_id=None, revision_id=revision_id)
356
ie = unpack_entry(e, entry_cache, return_from_cache)
361
def serialize_inventory_flat(inv, append, root_id, supported_kinds, working):
362
"""Serialize an inventory to a flat XML file.
364
:param inv: Inventory to serialize
365
:param append: Function for writing a line of output
366
:param working: If True skip history data - text_sha1, text_size,
367
reference_revision, symlink_target.
369
entries = inv.iter_entries()
371
root_path, root_ie = entries.next()
372
for path, ie in entries:
373
if ie.parent_id != root_id:
374
parent_str = ' parent_id="'
375
parent_id = encode_and_escape(ie.parent_id)
379
if ie.kind == 'file':
381
executable = ' executable="yes"'
385
append('<file%s file_id="%s name="%s%s%s revision="%s '
386
'text_sha1="%s" text_size="%d" />\n' % (
387
executable, encode_and_escape(ie.file_id),
388
encode_and_escape(ie.name), parent_str, parent_id,
389
encode_and_escape(ie.revision), ie.text_sha1,
392
append('<file%s file_id="%s name="%s%s%s />\n' % (
393
executable, encode_and_escape(ie.file_id),
394
encode_and_escape(ie.name), parent_str, parent_id))
395
elif ie.kind == 'directory':
397
append('<directory file_id="%s name="%s%s%s revision="%s '
399
encode_and_escape(ie.file_id),
400
encode_and_escape(ie.name),
401
parent_str, parent_id,
402
encode_and_escape(ie.revision)))
404
append('<directory file_id="%s name="%s%s%s />\n' % (
405
encode_and_escape(ie.file_id),
406
encode_and_escape(ie.name),
407
parent_str, parent_id))
408
elif ie.kind == 'symlink':
410
append('<symlink file_id="%s name="%s%s%s revision="%s '
411
'symlink_target="%s />\n' % (
412
encode_and_escape(ie.file_id),
413
encode_and_escape(ie.name),
414
parent_str, parent_id,
415
encode_and_escape(ie.revision),
416
encode_and_escape(ie.symlink_target)))
418
append('<symlink file_id="%s name="%s%s%s />\n' % (
419
encode_and_escape(ie.file_id),
420
encode_and_escape(ie.name),
421
parent_str, parent_id))
422
elif ie.kind == 'tree-reference':
423
if ie.kind not in supported_kinds:
424
raise errors.UnsupportedInventoryKind(ie.kind)
426
append('<tree-reference file_id="%s name="%s%s%s '
427
'revision="%s reference_revision="%s />\n' % (
428
encode_and_escape(ie.file_id),
429
encode_and_escape(ie.name),
430
parent_str, parent_id,
431
encode_and_escape(ie.revision),
432
encode_and_escape(ie.reference_revision)))
434
append('<tree-reference file_id="%s name="%s%s%s />\n' % (
435
encode_and_escape(ie.file_id),
436
encode_and_escape(ie.name),
437
parent_str, parent_id))
439
raise errors.UnsupportedInventoryKind(ie.kind)
440
append('</inventory>\n')