~bzr-pqm/bzr/bzr.dev : contents of bzrlib/xml8.py at revision 5820.1.1

~bzr-pqm/bzr/bzr.dev : (revision 5820.1.1)

4763.2.4 by John Arbash Meinel merge bzr.2.1 in preparation for NEWS entry.	1	# Copyright (C) 2005-2010 Canonical Ltd
1773.4.1 by Martin Pool Add pyflakes makefile target; fix many warnings	2	#
1189 by Martin Pool - BROKEN: partial support for commit into weave	3	# This program is free software; you can redistribute it and/or modify
	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
1887.1.1 by Adeodato Simó Do not separate paragraphs in the copyright statement with blank lines,	7	#
1189 by Martin Pool - BROKEN: partial support for commit into weave	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
1887.1.1 by Adeodato Simó Do not separate paragraphs in the copyright statement with blank lines,	12	#
1189 by Martin Pool - BROKEN: partial support for commit into weave	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob update FSF mailing address	15	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1189 by Martin Pool - BROKEN: partial support for commit into weave	16
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	17	import cStringIO
1934.1.4 by John Arbash Meinel rewrite escaper to use xml numerical entities, rather than using encode('utf8')	18	import re
1189 by Martin Pool - BROKEN: partial support for commit into weave	19
1911.2.6 by John Arbash Meinel Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly.	20	from bzrlib import (
	21	cache_utf8,
2100.3.1 by Aaron Bentley Start roundtripping tree-reference entries	22	errors,
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	23	inventory,
5671.2.3 by Jelmer Vernooij Move Repository._find_text_key_references_from_xml_inventory_lines onto the serializer.	24	lazy_regex,
2598.5.2 by Aaron Bentley Got all tests passing with Branch returning 'null:' for null revision	25	revision as _mod_revision,
3882.6.3 by John Arbash Meinel If we are going to thrash the inventory entry cache, increase its size.	26	trace,
1911.2.6 by John Arbash Meinel Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly.	27	)
4237.3.1 by Jelmer Vernooij Add new module with generic serializer information; keep XML-specific bits in	28	from bzrlib.xml_serializer import (
	29	Element,
	30	SubElement,
	31	XMLSerializer,
4416.5.1 by Jelmer Vernooij Move squashing of XML-invalid characters to XMLSerializer.	32	escape_invalid_chars,
4237.3.1 by Jelmer Vernooij Add new module with generic serializer information; keep XML-specific bits in	33	)
5121.2.4 by Jelmer Vernooij Remove more unused imports.	34	from bzrlib.inventory import InventoryEntry
1773.4.1 by Martin Pool Add pyflakes makefile target; fix many warnings	35	from bzrlib.revision import Revision
1189 by Martin Pool - BROKEN: partial support for commit into weave	36	from bzrlib.errors import BzrError
	37
	38
# Compiled lazily by _ensure_utf8_re(); they stay None until that is called.
_utf8_re = None
_unicode_re = None

# XML meta character -> the predefined entity reference that replaces it
# when text is written out.
_xml_escape_map = {
    "&": '&amp;',
    "'": "&apos;", # FIXME: overkill
    "\"": "&quot;",
    "<": "&lt;",
    ">": "&gt;",
    }

# Predefined entity name (the text between '&' and ';') -> the character it
# stands for. This is the inverse of _xml_escape_map.
_xml_unescape_map = {
    'apos': "'",
    'quot': '"',
    'amp': '&',
    'lt': '<',
    'gt': '>'
}
	56
	57
	58	def _unescaper(match, _map=_xml_unescape_map):
	59	code = match.group(1)
	60	try:
	61	return _map[code]
	62	except KeyError:
	63	if not code.startswith('#'):
	64	raise
	65	return unichr(int(code[1:])).encode('utf8')
	66
	67
	68	_unescape_re = None
	69
	70
	71	def _unescape_xml(data):
	72	"""Unescape predefined XML entities in a string of data."""
	73	global _unescape_re
	74	if _unescape_re is None:
	75	_unescape_re = re.compile('\&([^;]*);')
	76	return _unescape_re.sub(_unescaper, data)
	77
1934.1.4 by John Arbash Meinel rewrite escaper to use xml numerical entities, rather than using encode('utf8')	78
	79	def _ensure_utf8_re():
2249.5.10 by John Arbash Meinel Make sure xml5 can handle unicode or utf8 strings	80	"""Make sure the _utf8_re and _unicode_re regexes have been compiled."""
	81	global _utf8_re, _unicode_re
	82	if _utf8_re is None:
	83	_utf8_re = re.compile('[&<>\'\"]\|[\x80-\xff]+')
	84	if _unicode_re is None:
	85	_unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
	86
	87
	88	def _unicode_escape_replace(match, _map=_xml_escape_map):
1934.1.4 by John Arbash Meinel rewrite escaper to use xml numerical entities, rather than using encode('utf8')	89	"""Replace a string of non-ascii, non XML safe characters with their escape
	90
	91	This will escape both Standard XML escapes, like <>"', etc.
	92	As well as escaping non ascii characters, because ElementTree did.
	93	This helps us remain compatible to older versions of bzr. We may change
	94	our policy in the future, though.
	95	"""
1934.1.7 by John Arbash Meinel Comment why we do caching the way we do	96	# jam 20060816 Benchmarks show that try/KeyError is faster if you
	97	# expect the entity to rarely miss. There is about a 10% difference
	98	# in overall time. But if you miss frequently, then if None is much
	99	# faster. For our use case, we rarely have a revision id, file id
	100	# or path name that is unicode. So use try/KeyError.
1934.1.12 by John Arbash Meinel Switch back to using Entity serializer, since performance is equivalent, yet still compatible	101	try:
	102	return _map[match.group()]
	103	except KeyError:
	104	return "&#%d;" % ord(match.group())
1934.1.4 by John Arbash Meinel rewrite escaper to use xml numerical entities, rather than using encode('utf8')	105
	106
def _utf8_escape_replace(match, _map=_xml_escape_map):
    """Return the XML escape for one match inside a utf8 encoded string.

    A match is one of two things. It can be a "standard" XML meta character,
    which is simply looked up in _map. Otherwise it is a run of bytes with
    the high bit set: those are decoded from utf8 back into Unicode and each
    resulting character is written as a decimal numeric character reference.

    :param match: A regex match over an 8-bit string.
    :param _map: Mapping of XML meta characters to their entity references.
    :return: The escaped replacement text.
    """
    found = match.group()
    try:
        return _map[found]
    except KeyError:
        pass
    entities = ['&#%d;' % ord(uni_chr) for uni_chr in found.decode('utf8')]
    return ''.join(entities)
	120
	121
	122	_to_escaped_map = {}
	123
	124	def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
1934.1.4 by John Arbash Meinel rewrite escaper to use xml numerical entities, rather than using encode('utf8')	125	"""Encode the string into utf8, and escape invalid XML characters"""
1934.1.7 by John Arbash Meinel Comment why we do caching the way we do	126	# We frequently get entities we have not seen before, so it is better
	127	# to check if None, rather than try/KeyError
2249.5.10 by John Arbash Meinel Make sure xml5 can handle unicode or utf8 strings	128	text = _map.get(unicode_or_utf8_str)
1934.1.5 by John Arbash Meinel Cache the entity escaping cuts us down to 450ms	129	if text is None:
4088.3.1 by Benjamin Peterson compare types with 'is' not ==	130	if unicode_or_utf8_str.__class__ is unicode:
2249.5.10 by John Arbash Meinel Make sure xml5 can handle unicode or utf8 strings	131	# The alternative policy is to do a regular UTF8 encoding
	132	# and then escape only XML meta characters.
	133	# Performance is equivalent once you use cache_utf8. However
	134	# this makes the serialized texts incompatible with old versions
	135	# of bzr. So no net gain. (Perhaps the read code would handle utf8
	136	# better than entity escapes, but cElementTree seems to do just fine
	137	# either way)
	138	text = str(_unicode_re.sub(_unicode_escape_replace,
	139	unicode_or_utf8_str)) + '"'
	140	else:
	141	# Plain strings are considered to already be in utf-8 so we do a
	142	# slightly different method for escaping.
	143	text = _utf8_re.sub(_utf8_escape_replace,
	144	unicode_or_utf8_str) + '"'
	145	_map[unicode_or_utf8_str] = text
1934.1.5 by John Arbash Meinel Cache the entity escaping cuts us down to 450ms	146	return text
	147
	148
def _get_utf8_or_ascii(a_str,
                       _encode_utf8=cache_utf8.encode,
                       _get_cached_ascii=cache_utf8.get_cached_ascii):
    """Return a shared, 8-bit version of the string.

    cElementTree hands back a plain string when the XML was plain ascii and
    only produces Unicode when it has to. We want to work with utf-8 strings,
    so a plain string is interned and returned, while a Unicode string is
    encoded first.

    :param a_str: An 8-bit string or Unicode as returned by
        cElementTree.Element.get()
    :param _encode_utf8: Callable used to encode Unicode input.
    :param _get_cached_ascii: Not used by this function.
    :return: A utf-8 encoded 8-bit string.
    """
    # This is fairly optimized because we know what cElementTree does, this is
    # not meant as a generic function for all cases. Because it is possible for
    # an 8-bit string to not be ascii or valid utf8.
    if a_str.__class__ is not unicode:
        return intern(a_str)
    return _encode_utf8(a_str)
2249.5.4 by John Arbash Meinel When reading XML, always return utf-8 revision ids.	170
	171
def _clear_cache():
    """Empty the cache of escaped strings kept in _to_escaped_map."""
    _to_escaped_map.clear()
1934.1.4 by John Arbash Meinel rewrite escaper to use xml numerical entities, rather than using encode('utf8')	175
	176
class Serializer_v8(XMLSerializer):
    """This serialiser adds rich roots.

    Its revision format number matches its inventory number.
    """

    __slots__ = []

    # Value written as format="..." on the <inventory> element.
    format_num = '8'
    # When not None, used instead of format_num for <revision> elements.
    revision_format_num = None

    # Entries whose parent_id equals this value are written without a
    # parent_id attribute.
    root_id = None

    # Entry kinds this format can serialise.
    supported_kinds = set(('file', 'directory', 'symlink'))

    # This format supports the altered-by hack that reads file ids directly out
    # of the versionedfile, without doing XML parsing.
    support_altered_by_hack = True

    # The search regex used by xml based repositories to determine what things
    # were changed in a single commit.
    _file_ids_altered_regex = lazy_regex.lazy_compile(
        r'file_id="(?P<file_id>[^"]+)"'
        r'.* revision="(?P<revision_id>[^"]+)"'
        )
	200
2889.1.1 by Robert Collins * The class ``bzrlib.repofmt.knitrepo.KnitRepository3`` has been folded into	201	def _check_revisions(self, inv):
	202	"""Extension point for subclasses to check during serialisation.
	203
	204	:param inv: An inventory about to be serialised, to be checked.
4031.3.1 by Frank Aspell Fixing various typos	205	:raises: AssertionError if an error has occurred.
2889.1.1 by Robert Collins * The class ``bzrlib.repofmt.knitrepo.KnitRepository3`` has been folded into	206	"""
3376.2.4 by Martin Pool Remove every assert statement from bzrlib!	207	if inv.revision_id is None:
4505.5.2 by Robert Collins More informative assertions in xml serialisation.	208	raise AssertionError("inv.revision_id is None")
3376.2.4 by Martin Pool Remove every assert statement from bzrlib!	209	if inv.root.revision is None:
4505.5.2 by Robert Collins More informative assertions in xml serialisation.	210	raise AssertionError("inv.root.revision is None")
2889.1.1 by Robert Collins * The class ``bzrlib.repofmt.knitrepo.KnitRepository3`` has been folded into	211
3882.6.22 by John Arbash Meinel Start moving things around so that the entry cache is passed in.	212	def _check_cache_size(self, inv_size, entry_cache):
	213	"""Check that the entry_cache is large enough.
3882.6.12 by John Arbash Meinel Use resize logic to ensure our inventory entry cache is at an optimal size.	214
	215	We want the cache to be ~2x the size of an inventory. The reason is
	216	because we use a FIFO cache, and how Inventory records are likely to
	217	change. In general, you have a small number of records which change
	218	often, and a lot of records which do not change at all. So when the
	219	cache gets full, you actually flush out a lot of the records you are
	220	interested in, which means you need to recreate all of those records.
	221	An LRU Cache would be better, but the overhead negates the cache
	222	coherency benefit.
	223
	224	One way to look at it, only the size of the cache > len(inv) is your
	225	'working' set. And in general, it shouldn't be a problem to hold 2
	226	inventories in memory anyway.
	227
	228	:param inv_size: The number of entries in an inventory.
	229	"""
3882.6.22 by John Arbash Meinel Start moving things around so that the entry cache is passed in.	230	if entry_cache is None:
	231	return
3882.6.12 by John Arbash Meinel Use resize logic to ensure our inventory entry cache is at an optimal size.	232	# 1.5 times might also be reasonable.
3882.6.22 by John Arbash Meinel Start moving things around so that the entry cache is passed in.	233	recommended_min_cache_size = inv_size * 1.5
	234	if entry_cache.cache_size() < recommended_min_cache_size:
	235	recommended_cache_size = inv_size * 2
	236	trace.mutter('Resizing the inventory entry cache from %d to %d',
	237	entry_cache.cache_size(), recommended_cache_size)
	238	entry_cache.resize(recommended_cache_size)
3882.6.12 by John Arbash Meinel Use resize logic to ensure our inventory entry cache is at an optimal size.	239
2817.2.1 by Robert Collins * Inventory serialisation no longer double-sha's the content.	240	def write_inventory_to_lines(self, inv):
	241	"""Return a list of lines with the encoded inventory."""
	242	return self.write_inventory(inv, None)
	243
	244	def write_inventory_to_string(self, inv, working=False):
	245	"""Just call write_inventory with a StringIO and return the value.
	246
	247	:param working: If True skip history data - text_sha1, text_size,
	248	reference_revision, symlink_target.
	249	"""
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	250	sio = cStringIO.StringIO()
2817.2.1 by Robert Collins * Inventory serialisation no longer double-sha's the content.	251	self.write_inventory(inv, sio, working)
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	252	return sio.getvalue()
	253
2817.2.1 by Robert Collins * Inventory serialisation no longer double-sha's the content.	254	def write_inventory(self, inv, f, working=False):
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	255	"""Write inventory to a file.
3943.8.1 by Marius Kruger remove all trailing whitespace from bzr source	256
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	257	:param inv: the inventory to write.
2817.2.1 by Robert Collins * Inventory serialisation no longer double-sha's the content.	258	:param f: the file to write. (May be None if the lines are the desired
	259	output).
	260	:param working: If True skip history data - text_sha1, text_size,
	261	reference_revision, symlink_target.
	262	:return: The inventory as a list of lines.
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	263	"""
1934.1.4 by John Arbash Meinel rewrite escaper to use xml numerical entities, rather than using encode('utf8')	264	_ensure_utf8_re()
2889.1.1 by Robert Collins * The class ``bzrlib.repofmt.knitrepo.KnitRepository3`` has been folded into	265	self._check_revisions(inv)
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	266	output = []
1934.1.8 by John Arbash Meinel Passing around the append function rather than the list shaves off another 10%, down to 400ms	267	append = output.append
	268	self._append_inventory_root(append, inv)
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	269	entries = inv.iter_entries()
1934.1.4 by John Arbash Meinel rewrite escaper to use xml numerical entities, rather than using encode('utf8')	270	# Skip the root
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	271	root_path, root_ie = entries.next()
	272	for path, ie in entries:
2817.2.1 by Robert Collins * Inventory serialisation no longer double-sha's the content.	273	if ie.parent_id != self.root_id:
	274	parent_str = ' parent_id="'
	275	parent_id = _encode_and_escape(ie.parent_id)
	276	else:
	277	parent_str = ''
	278	parent_id = ''
	279	if ie.kind == 'file':
	280	if ie.executable:
	281	executable = ' executable="yes"'
	282	else:
	283	executable = ''
	284	if not working:
	285	append('<file%s file_id="%s name="%s%s%s revision="%s '
	286	'text_sha1="%s" text_size="%d" />\n' % (
	287	executable, _encode_and_escape(ie.file_id),
	288	_encode_and_escape(ie.name), parent_str, parent_id,
	289	_encode_and_escape(ie.revision), ie.text_sha1,
	290	ie.text_size))
	291	else:
	292	append('<file%s file_id="%s name="%s%s%s />\n' % (
	293	executable, _encode_and_escape(ie.file_id),
	294	_encode_and_escape(ie.name), parent_str, parent_id))
	295	elif ie.kind == 'directory':
	296	if not working:
	297	append('<directory file_id="%s name="%s%s%s revision="%s '
	298	'/>\n' % (
	299	_encode_and_escape(ie.file_id),
	300	_encode_and_escape(ie.name),
	301	parent_str, parent_id,
	302	_encode_and_escape(ie.revision)))
	303	else:
	304	append('<directory file_id="%s name="%s%s%s />\n' % (
	305	_encode_and_escape(ie.file_id),
	306	_encode_and_escape(ie.name),
	307	parent_str, parent_id))
	308	elif ie.kind == 'symlink':
	309	if not working:
	310	append('<symlink file_id="%s name="%s%s%s revision="%s '
	311	'symlink_target="%s />\n' % (
	312	_encode_and_escape(ie.file_id),
	313	_encode_and_escape(ie.name),
	314	parent_str, parent_id,
	315	_encode_and_escape(ie.revision),
	316	_encode_and_escape(ie.symlink_target)))
	317	else:
	318	append('<symlink file_id="%s name="%s%s%s />\n' % (
	319	_encode_and_escape(ie.file_id),
	320	_encode_and_escape(ie.name),
	321	parent_str, parent_id))
	322	elif ie.kind == 'tree-reference':
	323	if ie.kind not in self.supported_kinds:
	324	raise errors.UnsupportedInventoryKind(ie.kind)
	325	if not working:
	326	append('<tree-reference file_id="%s name="%s%s%s '
	327	'revision="%s reference_revision="%s />\n' % (
	328	_encode_and_escape(ie.file_id),
	329	_encode_and_escape(ie.name),
	330	parent_str, parent_id,
	331	_encode_and_escape(ie.revision),
	332	_encode_and_escape(ie.reference_revision)))
	333	else:
	334	append('<tree-reference file_id="%s name="%s%s%s />\n' % (
	335	_encode_and_escape(ie.file_id),
	336	_encode_and_escape(ie.name),
337	parent_str, parent_id))
338	else:
339	raise errors.UnsupportedInventoryKind(ie.kind)
1934.1.8 by John Arbash Meinel Passing around the append function rather than the list shaves off another 10%, down to 400ms	340	append('</inventory>\n')
2817.2.1 by Robert Collins * Inventory serialisation no longer double-sha's the content.	341	if f is not None:
	342	f.writelines(output)
1934.1.5 by John Arbash Meinel Cache the entity escaping cuts us down to 450ms	343	# Just to keep the cache from growing without bounds
	344	# but we may actually not want to do clear the cache
1934.1.6 by John Arbash Meinel With a full cache the time is down to 381 ms	345	#_clear_cache()
2817.2.1 by Robert Collins * Inventory serialisation no longer double-sha's the content.	346	return output
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	347
1934.1.8 by John Arbash Meinel Passing around the append function rather than the list shaves off another 10%, down to 400ms	348	def _append_inventory_root(self, append, inv):
1934.1.3 by John Arbash Meinel [merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences	349	"""Append the inventory root to output."""
	350	if inv.revision_id is not None:
2817.2.1 by Robert Collins * Inventory serialisation no longer double-sha's the content.	351	revid1 = ' revision_id="'
	352	revid2 = _encode_and_escape(inv.revision_id)
	353	else:
	354	revid1 = ""
	355	revid2 = ""
3311.3.4 by Aaron Bentley Have xml5 inherit from xml6 from xml8	356	append('<inventory format="%s"%s%s>\n' % (
	357	self.format_num, revid1, revid2))
	358	append('<directory file_id="%s name="%s revision="%s />\n' % (
	359	_encode_and_escape(inv.root.file_id),
	360	_encode_and_escape(inv.root.name),
	361	_encode_and_escape(inv.root.revision)))
	362
1189 by Martin Pool - BROKEN: partial support for commit into weave	363	def _pack_revision(self, rev):
	364	"""Revision object -> xml tree"""
2249.5.5 by John Arbash Meinel better comment for why we are decoding	365	# For the XML format, we need to write them as Unicode rather than as
	366	# utf-8 strings. So that cElementTree can handle properly escaping
	367	# them.
2249.5.4 by John Arbash Meinel When reading XML, always return utf-8 revision ids.	368	decode_utf8 = cache_utf8.decode
2249.5.5 by John Arbash Meinel better comment for why we are decoding	369	revision_id = rev.revision_id
	370	if isinstance(revision_id, str):
	371	revision_id = decode_utf8(revision_id)
3311.3.3 by Aaron Bentley Handle format 5 revision	372	format_num = self.format_num
	373	if self.revision_format_num is not None:
	374	format_num = self.revision_format_num
1189 by Martin Pool - BROKEN: partial support for commit into weave	375	root = Element('revision',
	376	committer = rev.committer,
2102.4.1 by John Arbash Meinel Switch to using millisecond resolution in Revision XML	377	timestamp = '%.3f' % rev.timestamp,
2249.5.5 by John Arbash Meinel better comment for why we are decoding	378	revision_id = revision_id,
1189 by Martin Pool - BROKEN: partial support for commit into weave	379	inventory_sha1 = rev.inventory_sha1,
3311.3.3 by Aaron Bentley Handle format 5 revision	380	format=format_num,
1189 by Martin Pool - BROKEN: partial support for commit into weave	381	)
1913.1.1 by John Arbash Meinel Fix bug #55783	382	if rev.timezone is not None:
1189 by Martin Pool - BROKEN: partial support for commit into weave	383	root.set('timezone', str(rev.timezone))
	384	root.text = '\n'
	385	msg = SubElement(root, 'message')
4416.5.1 by Jelmer Vernooij Move squashing of XML-invalid characters to XMLSerializer.	386	msg.text = escape_invalid_chars(rev.message)[0]
1189 by Martin Pool - BROKEN: partial support for commit into weave	387	msg.tail = '\n'
1313 by Martin Pool - rename to Revision.parent_ids to avoid confusion with old usage	388	if rev.parent_ids:
1189 by Martin Pool - BROKEN: partial support for commit into weave	389	pelts = SubElement(root, 'parents')
	390	pelts.tail = pelts.text = '\n'
1313 by Martin Pool - rename to Revision.parent_ids to avoid confusion with old usage	391	for parent_id in rev.parent_ids:
2598.5.2 by Aaron Bentley Got all tests passing with Branch returning 'null:' for null revision	392	_mod_revision.check_not_reserved_id(parent_id)
1189 by Martin Pool - BROKEN: partial support for commit into weave	393	p = SubElement(pelts, 'revision_ref')
	394	p.tail = '\n'
2249.5.5 by John Arbash Meinel better comment for why we are decoding	395	if isinstance(parent_id, str):
	396	parent_id = decode_utf8(parent_id)
	397	p.set('revision_id', parent_id)
1185.16.36 by Martin Pool - store revision properties in revision xml	398	if rev.properties:
	399	self._pack_revision_properties(rev, root)
1189 by Martin Pool - BROKEN: partial support for commit into weave	400	return root
1185.16.36 by Martin Pool - store revision properties in revision xml	401
	402	def _pack_revision_properties(self, rev, under_element):
	403	top_elt = SubElement(under_element, 'properties')
	404	for prop_name, prop_value in sorted(rev.properties.items()):
	405	prop_elt = SubElement(top_elt, 'property')
	406	prop_elt.set('name', prop_name)
	407	prop_elt.text = prop_value
	408	prop_elt.tail = '\n'
	409	top_elt.tail = '\n'
	410
4849.4.2 by John Arbash Meinel Change from being a per-serializer attribute to being a per-repo attribute.	411	def _unpack_inventory(self, elt, revision_id=None, entry_cache=None,
	412	return_from_cache=False):
3311.3.4 by Aaron Bentley Have xml5 inherit from xml6 from xml8	413	"""Construct from XML Element"""
	414	if elt.tag != 'inventory':
	415	raise errors.UnexpectedInventoryFormat('Root tag is %r' % elt.tag)
1393.1.59 by Martin Pool - put 'format=5' on inventory and revision xml	416	format = elt.get('format')
3311.3.4 by Aaron Bentley Have xml5 inherit from xml6 from xml8	417	if format != self.format_num:
	418	raise errors.UnexpectedInventoryFormat('Invalid format version %r'
	419	% format)
	420	revision_id = elt.get('revision_id')
	421	if revision_id is not None:
	422	revision_id = cache_utf8.encode(revision_id)
	423	inv = inventory.Inventory(root_id=None, revision_id=revision_id)
1189 by Martin Pool - BROKEN: partial support for commit into weave	424	for e in elt:
4849.4.2 by John Arbash Meinel Change from being a per-serializer attribute to being a per-repo attribute.	425	ie = self._unpack_entry(e, entry_cache=entry_cache,
	426	return_from_cache=return_from_cache)
1189 by Martin Pool - BROKEN: partial support for commit into weave	427	inv.add(ie)
3882.6.22 by John Arbash Meinel Start moving things around so that the entry cache is passed in.	428	self._check_cache_size(len(inv), entry_cache)
1189 by Martin Pool - BROKEN: partial support for commit into weave	429	return inv
	430
4849.4.2 by John Arbash Meinel Change from being a per-serializer attribute to being a per-repo attribute.	431	def _unpack_entry(self, elt, entry_cache=None, return_from_cache=False):
3882.6.5 by John Arbash Meinel Use a FIFOCache instead of an LRUCache, and factor out elt.get	432	elt_get = elt.get
	433	file_id = elt_get('file_id')
	434	revision = elt_get('revision')
	435	# Check and see if we have already unpacked this exact entry
3882.6.8 by John Arbash Meinel Add detailed timings on the last 100 mysql revisions.	436	# Some timings for "repo.revision_trees(last_100_revs)"
	437	# bzr mysql
	438	# unmodified 4.1s 40.8s
3882.6.6 by John Arbash Meinel Add some actual timings, supporting why we use a FIFOCache.	439	# using lru 3.5s
3882.6.8 by John Arbash Meinel Add detailed timings on the last 100 mysql revisions.	440	# using fifo 2.83s 29.1s
3882.6.6 by John Arbash Meinel Add some actual timings, supporting why we use a FIFOCache.	441	# lru._cache 2.8s
3882.6.8 by John Arbash Meinel Add detailed timings on the last 100 mysql revisions.	442	# dict 2.75s 26.8s
	443	# inv.add 2.5s 26.0s
	444	# no_copy 2.00s 20.5s
	445	# no_c,dict 1.95s 18.0s
3882.6.6 by John Arbash Meinel Add some actual timings, supporting why we use a FIFOCache.	446	# Note that a cache of 10k nodes is more than sufficient to hold all of
3882.6.9 by John Arbash Meinel Add some more direct timings using time.clock() instead of lsprof.	447	# the inventory for the last 100 revs for bzr, but not for mysql (20k
	448	# is enough for mysql, which saves the same 2s as using a dict)
	449
	450	# Breakdown of mysql using time.clock()
	451	# 4.1s 2 calls to element.get for file_id, revision_id
	452	# 4.5s cache_hit lookup
	453	# 7.1s InventoryFile.copy()
	454	# 2.4s InventoryDirectory.copy()
	455	# 0.4s decoding unique entries
3882.6.11 by John Arbash Meinel comment update	456	# 1.6s decoding entries after FIFO fills up
3882.6.9 by John Arbash Meinel Add some more direct timings using time.clock() instead of lsprof.	457	# 0.8s Adding nodes to FIFO (including flushes)
	458	# 0.1s cache miss lookups
	459	# Using an LRU cache
	460	# 4.1s 2 calls to element.get for file_id, revision_id
	461	# 9.9s cache_hit lookup
	462	# 10.8s InventoryEntry.copy()
	463	# 0.3s cache miss lookus
	464	# 1.2s decoding entries
	465	# 1.0s adding nodes to LRU
3882.6.22 by John Arbash Meinel Start moving things around so that the entry cache is passed in.	466	if entry_cache is not None and revision is not None:
	467	key = (file_id, revision)
	468	try:
4031.3.1 by Frank Aspell Fixing various typos	469	# We copy it, because some operations may mutate it
3882.6.22 by John Arbash Meinel Start moving things around so that the entry cache is passed in.	470	cached_ie = entry_cache[key]
	471	except KeyError:
	472	pass
	473	else:
	474	# Only copying directory entries drops us 2.85s => 2.35s
4849.4.2 by John Arbash Meinel Change from being a per-serializer attribute to being a per-repo attribute.	475	if return_from_cache:
4849.4.1 by John Arbash Meinel Add a flag that controls if we will return InventoryEntries from the cache.	476	if cached_ie.kind == 'directory':
	477	return cached_ie.copy()
	478	return cached_ie
3882.6.22 by John Arbash Meinel Start moving things around so that the entry cache is passed in.	479	return cached_ie.copy()
3882.6.5 by John Arbash Meinel Use a FIFOCache instead of an LRUCache, and factor out elt.get	480
1189 by Martin Pool - BROKEN: partial support for commit into weave	481	kind = elt.tag
1399.1.6 by Robert Collins move exporting functionality into inventory.py - uncovers bug in symlink support	482	if not InventoryEntry.versionable_kind(kind):
1092.2.20 by Robert Collins symlink and weaves, whaddya know	483	raise AssertionError('unsupported entry kind %s' % kind)
1189 by Martin Pool - BROKEN: partial support for commit into weave	484
3882.6.13 by John Arbash Meinel We don't need to inline get_cached until we've had the miss.	485	get_cached = _get_utf8_or_ascii
	486
3882.6.1 by John Arbash Meinel Add an InventoryEntry cache to the xml deserializer.	487	file_id = get_cached(file_id)
	488	if revision is not None:
	489	revision = get_cached(revision)
3882.6.5 by John Arbash Meinel Use a FIFOCache instead of an LRUCache, and factor out elt.get	490	parent_id = elt_get('parent_id')
2294.1.10 by John Arbash Meinel Switch all apis over to utf8 file ids. All tests pass	491	if parent_id is not None:
	492	parent_id = get_cached(parent_id)
1189 by Martin Pool - BROKEN: partial support for commit into weave	493
1399.1.8 by Robert Collins factor out inventory directory logic into 'InventoryDirectory' class	494	if kind == 'directory':
1911.2.6 by John Arbash Meinel Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly.	495	ie = inventory.InventoryDirectory(file_id,
3882.6.5 by John Arbash Meinel Use a FIFOCache instead of an LRUCache, and factor out elt.get	496	elt_get('name'),
1399.1.8 by Robert Collins factor out inventory directory logic into 'InventoryDirectory' class	497	parent_id)
1399.1.9 by Robert Collins factor out file related logic from InventoryEntry to InventoryFile	498	elif kind == 'file':
1911.2.6 by John Arbash Meinel Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly.	499	ie = inventory.InventoryFile(file_id,
3882.6.5 by John Arbash Meinel Use a FIFOCache instead of an LRUCache, and factor out elt.get	500	elt_get('name'),
1399.1.9 by Robert Collins factor out file related logic from InventoryEntry to InventoryFile	501	parent_id)
3882.6.5 by John Arbash Meinel Use a FIFOCache instead of an LRUCache, and factor out elt.get	502	ie.text_sha1 = elt_get('text_sha1')
	503	if elt_get('executable') == 'yes':
1399.1.9 by Robert Collins factor out file related logic from InventoryEntry to InventoryFile	504	ie.executable = True
3882.6.5 by John Arbash Meinel Use a FIFOCache instead of an LRUCache, and factor out elt.get	505	v = elt_get('text_size')
1399.1.9 by Robert Collins factor out file related logic from InventoryEntry to InventoryFile	506	ie.text_size = v and int(v)
1399.1.10 by Robert Collins remove kind from the InventoryEntry constructor - only child classes should be created now	507	elif kind == 'symlink':
1911.2.6 by John Arbash Meinel Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly.	508	ie = inventory.InventoryLink(file_id,
3882.6.5 by John Arbash Meinel Use a FIFOCache instead of an LRUCache, and factor out elt.get	509	elt_get('name'),
1399.1.10 by Robert Collins remove kind from the InventoryEntry constructor - only child classes should be created now	510	parent_id)
3882.6.5 by John Arbash Meinel Use a FIFOCache instead of an LRUCache, and factor out elt.get	511	ie.symlink_target = elt_get('symlink_target')
1399.1.8 by Robert Collins factor out inventory directory logic into 'InventoryDirectory' class	512	else:
2100.3.1 by Aaron Bentley Start roundtripping tree-reference entries	513	raise errors.UnsupportedInventoryKind(kind)
1911.2.6 by John Arbash Meinel Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly.	514	ie.revision = revision
3882.6.22 by John Arbash Meinel Start moving things around so that the entry cache is passed in.	515	if revision is not None and entry_cache is not None:
3882.6.21 by John Arbash Meinel Don't cache the InventoryEntry we will return, callers mutate those objects.	516	# We cache a copy() because callers like to mutate objects, and
	517	# that would cause the item in cache to mutate as well.
	518	# This has a small effect on many-inventory performance, because
	519	# the majority fraction is spent in cache hits, not misses.
3882.6.22 by John Arbash Meinel Start moving things around so that the entry cache is passed in.	520	entry_cache[key] = ie.copy()
1189 by Martin Pool - BROKEN: partial support for commit into weave	521
	522	return ie
	523
	524	def _unpack_revision(self, elt):
	525	"""XML Element -> Revision object"""
1393.1.59 by Martin Pool - put 'format=5' on inventory and revision xml	526	format = elt.get('format')
3311.3.3 by Aaron Bentley Handle format 5 revision	527	format_num = self.format_num
	528	if self.revision_format_num is not None:
	529	format_num = self.revision_format_num
1393.1.59 by Martin Pool - put 'format=5' on inventory and revision xml	530	if format is not None:
3311.3.3 by Aaron Bentley Handle format 5 revision	531	if format != format_num:
3311.3.3 by Aaron Bentley Handle format 5 revision	532	raise BzrError("invalid format version %r on revision"
1393.1.59 by Martin Pool - put 'format=5' on inventory and revision xml	533	% format)
2249.5.4 by John Arbash Meinel When reading XML, always return utf-8 revision ids.	534	get_cached = _get_utf8_or_ascii
1189 by Martin Pool - BROKEN: partial support for commit into weave	535	rev = Revision(committer = elt.get('committer'),
	536	timestamp = float(elt.get('timestamp')),
1911.2.6 by John Arbash Meinel Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly.	537	revision_id = get_cached(elt.get('revision_id')),
1189 by Martin Pool - BROKEN: partial support for commit into weave	538	inventory_sha1 = elt.get('inventory_sha1')
	539	)
	540	parents = elt.find('parents') or []
	541	for p in parents:
1911.2.6 by John Arbash Meinel Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly.	542	rev.parent_ids.append(get_cached(p.get('revision_id')))
1185.16.37 by Martin Pool - properties are retrieved when revisions are loaded	543	self._unpack_revision_properties(elt, rev)
1189 by Martin Pool - BROKEN: partial support for commit into weave	544	v = elt.get('timezone')
1913.1.1 by John Arbash Meinel Fix bug #55783	545	if v is None:
	546	rev.timezone = 0
	547	else:
	548	rev.timezone = int(v)
1189 by Martin Pool - BROKEN: partial support for commit into weave	549	rev.message = elt.findtext('message') # text of <message>
	550	return rev
	551
1185.16.37 by Martin Pool - properties are retrieved when revisions are loaded	552	def _unpack_revision_properties(self, elt, rev):
	553	"""Unpack properties onto a revision."""
	554	props_elt = elt.find('properties')
	555	if not props_elt:
	556	return
	557	for prop_elt in props_elt:
3376.2.4 by Martin Pool Remove every assert statement from bzrlib!	558	if prop_elt.tag != 'property':
	559	raise AssertionError(
	560	"bad tag under properties list: %r" % prop_elt.tag)
1185.16.37 by Martin Pool - properties are retrieved when revisions are loaded	561	name = prop_elt.get('name')
	562	value = prop_elt.text
1886.1.1 by John Arbash Meinel Fix bug #47782,	563	# If a property had an empty value ('') cElementTree reads
	564	# that back as None, convert it back to '', so that all
	565	# properties have string values
	566	if value is None:
	567	value = ''
3376.2.4 by Martin Pool Remove every assert statement from bzrlib!	568	if name in rev.properties:
	569	raise AssertionError("repeated property %r" % name)
1185.16.37 by Martin Pool - properties are retrieved when revisions are loaded	570	rev.properties[name] = value
	571
5671.2.3 by Jelmer Vernooij Move Repository._find_text_key_references_from_xml_inventory_lines onto the serializer.	572	def _find_text_key_references(self, line_iterator):
	573	"""Core routine for extracting references to texts from inventories.
	574
	575	This performs the translation of xml lines to revision ids.
	576
	577	:param line_iterator: An iterator of lines, origin_version_id
	578	:return: A dictionary mapping text keys ((fileid, revision_id) tuples)
	579	to whether they were referred to by the inventory of the
	580	revision_id that they contain. Note that if that revision_id was
	581	not part of the line_iterator's output then False will be given -
	582	even though it may actually refer to that key.
	583	"""
	584	if not self.support_altered_by_hack:
	585	raise AssertionError(
	586	"_find_text_key_references only "
	587	"supported for branches which store inventory as unnested xml"
	588	", not on %r" % self)
	589	result = {}
	590
	591	# this code needs to read every new line in every inventory for the
	592	# inventories [revision_ids]. Seeing a line twice is ok. Seeing a line
	593	# not present in one of those inventories is unnecessary but not
	594	# harmful because we are filtering by the revision id marker in the
	595	# inventory lines : we only select file ids altered in one of those
	596	# revisions. We don't need to see all lines in the inventory because
	597	# only those added in an inventory in rev X can contain a revision=X
	598	# line.
	599	unescape_revid_cache = {}
	600	unescape_fileid_cache = {}
	601
	602	# jam 20061218 In a big fetch, this handles hundreds of thousands
	603	# of lines, so it has had a lot of inlining and optimizing done.
	604	# Sorry that it is a little bit messy.
	605	# Move several functions to be local variables, since this is a long
	606	# running loop.
	607	search = self._file_ids_altered_regex.search
	608	unescape = _unescape_xml
	609	setdefault = result.setdefault
	610	for line, line_key in line_iterator:
	611	match = search(line)
	612	if match is None:
	613	continue
	614	# One call to match.group() returning multiple items is quite a
	615	# bit faster than 2 calls to match.group() each returning 1
	616	file_id, revision_id = match.group('file_id', 'revision_id')
	617
	618	# Inlining the cache lookups helps a lot when you make 170,000
	619	# lines and 350k ids, versus 8.4 unique ids.
	620	# Using a cache helps in 2 ways:
	621	# 1) Avoids unnecessary decoding calls
	622	# 2) Re-uses cached strings, which helps in future set and
	623	# equality checks.
	624	# (2) is enough that removing encoding entirely along with
	625	# the cache (so we are using plain strings) results in no
	626	# performance improvement.
	627	try:
	628	revision_id = unescape_revid_cache[revision_id]
	629	except KeyError:
	630	unescaped = unescape(revision_id)
	631	unescape_revid_cache[revision_id] = unescaped
	632	revision_id = unescaped
	633
	634	# Note that unconditionally unescaping means that we deserialise
	635	# every fileid, which for general 'pull' is not great, but we don't
636	# really want to have some many fulltexts that this matters anyway.
637	# RBC 20071114.
638	try:
639	file_id = unescape_fileid_cache[file_id]
640	except KeyError:
641	unescaped = unescape(file_id)
642	unescape_fileid_cache[file_id] = unescaped
643	file_id = unescaped
644
645	key = (file_id, revision_id)
646	setdefault(key, False)
647	if revision_id == line_key[-1]:
648	result[key] = True
649	return result
650
1185.16.37 by Martin Pool - properties are retrieved when revisions are loaded	651
# Module-level Serializer_v8 instance.
3311.3.4 by Aaron Bentley Have xml5 inherit from xml6 from xml8	652	serializer_v8 = Serializer_v8()