3311.3.4
by Aaron Bentley
Have xml5 inherit from xml6 from xml8 |
1 |
# Copyright (C) 2005, 2006, 2007, 2008 Canonical Ltd
|
1773.4.1
by Martin Pool
Add pyflakes makefile target; fix many warnings |
2 |
#
|
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
3 |
# This program is free software; you can redistribute it and/or modify
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
1887.1.1
by Adeodato Simó
Do not separate paragraphs in the copyright statement with blank lines, |
7 |
#
|
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
8 |
# This program is distributed in the hope that it will be useful,
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
1887.1.1
by Adeodato Simó
Do not separate paragraphs in the copyright statement with blank lines, |
12 |
#
|
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
13 |
# You should have received a copy of the GNU General Public License
|
14 |
# along with this program; if not, write to the Free Software
|
|
4183.7.1
by Sabin Iacob
update FSF mailing address |
15 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
16 |
|
1934.1.3
by John Arbash Meinel
[merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences |
17 |
import cStringIO |
1934.1.4
by John Arbash Meinel
rewrite escaper to use xml numerical entities, rather than using encode('utf8') |
18 |
import re |
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
19 |
|
1911.2.6
by John Arbash Meinel
Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly. |
20 |
from bzrlib import ( |
21 |
cache_utf8, |
|
2100.3.1
by Aaron Bentley
Start roundtripping tree-reference entries |
22 |
errors, |
1934.1.3
by John Arbash Meinel
[merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences |
23 |
inventory, |
2598.5.2
by Aaron Bentley
Got all tests passing with Branch returning 'null:' for null revision |
24 |
revision as _mod_revision, |
3882.6.3
by John Arbash Meinel
If we are going to thrash the inventory entry cache, increase its size. |
25 |
trace, |
1911.2.6
by John Arbash Meinel
Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly. |
26 |
)
|
4237.3.1
by Jelmer Vernooij
Add new module with generic serializer information; keep XML-specific bits in |
27 |
from bzrlib.xml_serializer import ( |
28 |
Element, |
|
29 |
SubElement, |
|
30 |
XMLSerializer, |
|
4416.5.1
by Jelmer Vernooij
Move squashing of XML-invalid characters to XMLSerializer. |
31 |
escape_invalid_chars, |
4237.3.1
by Jelmer Vernooij
Add new module with generic serializer information; keep XML-specific bits in |
32 |
)
|
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
33 |
from bzrlib.inventory import ROOT_ID, Inventory, InventoryEntry |
1773.4.1
by Martin Pool
Add pyflakes makefile target; fix many warnings |
34 |
from bzrlib.revision import Revision |
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
35 |
from bzrlib.errors import BzrError |
36 |
||
37 |
||
1934.1.4
by John Arbash Meinel
rewrite escaper to use xml numerical entities, rather than using encode('utf8') |
38 |
# Lazily compiled regexes; populated by _ensure_utf8_re() on first use.
_utf8_re = None
_unicode_re = None

# Replacement text for the characters that must always be escaped inside an
# XML attribute value (the five predefined XML entities).
_xml_escape_map = {
    "&": '&amp;',
    "'": "&apos;",  # FIXME: overkill
    "\"": "&quot;",
    "<": "&lt;",
    ">": "&gt;",
    }
|
|
47 |
||
48 |
||
49 |
def _ensure_utf8_re():
    """Make sure the _utf8_re and _unicode_re regexes have been compiled.

    Both module-level patterns start out as None and are compiled at most
    once; calling this function again is a no-op.
    """
    global _utf8_re, _unicode_re
    if _utf8_re is None:
        # Either one XML meta character, or a run of high-bit bytes.
        _utf8_re = re.compile('[&<>\'\"]|[\x80-\xff]+')
    if _unicode_re is None:
        # Exactly one character: an XML meta character or any non-ascii
        # code point in the range U+0080..U+FFFF.
        _unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
|
56 |
||
57 |
||
58 |
def _unicode_escape_replace(match, _map=_xml_escape_map):
    """Replace a string of non-ascii, non XML safe characters with their escape

    This will escape both Standard XML escapes, like <>"', etc.
    As well as escaping non ascii characters, because ElementTree did.
    This helps us remain compatible to older versions of bzr. We may change
    our policy in the future, though.

    :param match: A regex match object. The fallback applies ord() to the
        whole match, so the matched text must be a single character.
    :param _map: Mapping from XML meta characters to their entity text.
    :return: The entity from ``_map``, or a ``&#NNN;`` numeric character
        reference for anything not in the map.
    """
    # jam 20060816 Benchmarks show that try/KeyError is faster if you
    # expect the entity to rarely miss. There is about a 10% difference
    # in overall time. But if you miss frequently, then if None is much
    # faster. For our use case, we *rarely* have a revision id, file id
    # or path name that is unicode. So use try/KeyError.
    try:
        return _map[match.group()]
    except KeyError:
        return "&#%d;" % ord(match.group())
|
1934.1.4
by John Arbash Meinel
rewrite escaper to use xml numerical entities, rather than using encode('utf8') |
75 |
|
76 |
||
2249.5.10
by John Arbash Meinel
Make sure xml5 can handle unicode or utf8 strings |
77 |
def _utf8_escape_replace(match, _map=_xml_escape_map):
    """Escape utf8 characters into XML safe ones.

    This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
    or it is handling characters with the high-bit set. For ascii characters,
    we just lookup the replacement in the dictionary. For everything else, we
    decode back into Unicode, and then use the XML escape code.

    :param match: A regex match object over an 8-bit string.
    :param _map: Mapping from XML meta characters to their entity text.
    :return: The entity from ``_map``, or the concatenated ``&#NNN;``
        references of every character obtained by decoding the matched
        bytes as utf8.
    """
    try:
        return _map[match.group()]
    except KeyError:
        return ''.join('&#%d;' % ord(uni_chr)
                       for uni_chr in match.group().decode('utf8'))
|
90 |
||
91 |
||
92 |
# Cache of input string => escaped text (including the trailing quote), filled
# by _encode_and_escape() and emptied by _clear_cache(). It is not bounded.
_to_escaped_map = {}
|
93 |
||
94 |
def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
    """Escape a string for use as an XML attribute value.

    XML meta characters are replaced by their named entities and non-ascii
    characters by numeric character references. The result is cached in
    ``_map`` keyed by the input string.

    Note that the returned text has a closing double quote appended, so
    callers format it as ``attr="%s`` without writing the closing quote
    themselves.

    _ensure_utf8_re() must have been called beforehand: the module-level
    regexes used here are None until then.

    :param unicode_or_utf8_str: Either a unicode object, or an 8-bit string
        that is considered to already be utf-8 encoded.
    :param _map: The cache to consult and update.
    :return: The escaped text followed by '"'.
    """
    # We frequently get entities we have not seen before, so it is better
    # to check if None, rather than try/KeyError
    text = _map.get(unicode_or_utf8_str)
    if text is None:
        if unicode_or_utf8_str.__class__ is unicode:
            # The alternative policy is to do a regular UTF8 encoding
            # and then escape only XML meta characters.
            # Performance is equivalent once you use cache_utf8. *However*
            # this makes the serialized texts incompatible with old versions
            # of bzr. So no net gain. (Perhaps the read code would handle utf8
            # better than entity escapes, but cElementTree seems to do just fine
            # either way)
            text = str(_unicode_re.sub(_unicode_escape_replace,
                                       unicode_or_utf8_str)) + '"'
        else:
            # Plain strings are considered to already be in utf-8 so we do a
            # slightly different method for escaping.
            text = _utf8_re.sub(_utf8_escape_replace,
                                unicode_or_utf8_str) + '"'
        _map[unicode_or_utf8_str] = text
    return text
117 |
||
118 |
||
2249.5.4
by John Arbash Meinel
When reading XML, always return utf-8 revision ids. |
119 |
def _get_utf8_or_ascii(a_str,
                       _encode_utf8=cache_utf8.encode,
                       _get_cached_ascii=cache_utf8.get_cached_ascii):
    """Return a cached version of the string.

    cElementTree will return a plain string if the XML is plain ascii. It only
    returns Unicode when it needs to. We want to work in utf-8 strings. So if
    cElementTree returns a plain string, we can just return the cached version.
    If it is Unicode, then we need to encode it.

    :param a_str: An 8-bit string or Unicode as returned by
        cElementTree.Element.get()
    :param _encode_utf8: Encoder applied to unicode input.
    :param _get_cached_ascii: Not used by the current implementation (plain
        strings go through intern() instead); kept so the signature is
        unchanged.
    :return: A utf-8 encoded 8-bit string.
    """
    # This is fairly optimized because we know what cElementTree does, this is
    # not meant as a generic function for all cases. Because it is possible for
    # an 8-bit string to not be ascii or valid utf8.
    if a_str.__class__ is unicode:
        return _encode_utf8(a_str)
    else:
        return intern(a_str)
2249.5.4
by John Arbash Meinel
When reading XML, always return utf-8 revision ids. |
140 |
|
141 |
||
1934.1.5
by John Arbash Meinel
Cache the entity escaping cuts us down to 450ms |
142 |
def _clear_cache():
    """Clean out the unicode => escaped map"""
    # Clears the dict in place, so references to the same dict object
    # (e.g. the default argument of _encode_and_escape) see the change.
    _to_escaped_map.clear()
1934.1.4
by John Arbash Meinel
rewrite escaper to use xml numerical entities, rather than using encode('utf8') |
145 |
|
146 |
||
4237.3.1
by Jelmer Vernooij
Add new module with generic serializer information; keep XML-specific bits in |
147 |
class Serializer_v8(XMLSerializer): |
3311.3.4
by Aaron Bentley
Have xml5 inherit from xml6 from xml8 |
148 |
"""This serialiser adds rich roots.
|
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
149 |
|
3311.3.4
by Aaron Bentley
Have xml5 inherit from xml6 from xml8 |
150 |
Its revision format number matches its inventory number.
|
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
151 |
"""
|
3311.3.4
by Aaron Bentley
Have xml5 inherit from xml6 from xml8 |
152 |
|
3882.6.22
by John Arbash Meinel
Start moving things around so that the entry cache is passed in. |
153 |
__slots__ = [] |
1934.1.3
by John Arbash Meinel
[merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences |
154 |
|
3311.3.4
by Aaron Bentley
Have xml5 inherit from xml6 from xml8 |
155 |
root_id = None |
1910.2.48
by Aaron Bentley
Update from review comments |
156 |
support_altered_by_hack = True |
157 |
# This format supports the altered-by hack that reads file ids directly out
|
|
158 |
# of the versionedfile, without doing XML parsing.
|
|
159 |
||
2100.3.1
by Aaron Bentley
Start roundtripping tree-reference entries |
160 |
supported_kinds = set(['file', 'directory', 'symlink']) |
3311.3.4
by Aaron Bentley
Have xml5 inherit from xml6 from xml8 |
161 |
format_num = '8' |
3311.3.3
by Aaron Bentley
Handle format 5 revision |
162 |
revision_format_num = None |
2100.3.1
by Aaron Bentley
Start roundtripping tree-reference entries |
163 |
|
2889.1.1
by Robert Collins
* The class ``bzrlib.repofmt.knitrepo.KnitRepository3`` has been folded into |
164 |
def _check_revisions(self, inv):
    """Extension point for subclasses to check during serialisation.

    :param inv: An inventory about to be serialised, to be checked.
    :raises: AssertionError if an error has occurred.
    """
    # This implementation requires both the inventory and its root entry
    # to carry a revision.
    if inv.revision_id is None:
        raise AssertionError('inv.revision_id is None')
    if inv.root.revision is None:
        raise AssertionError('inv.root.revision is None')
|
2889.1.1
by Robert Collins
* The class ``bzrlib.repofmt.knitrepo.KnitRepository3`` has been folded into |
174 |
|
3882.6.22
by John Arbash Meinel
Start moving things around so that the entry cache is passed in. |
175 |
def _check_cache_size(self, inv_size, entry_cache):
    """Check that the entry_cache is large enough.

    We want the cache to be ~2x the size of an inventory. The reason is
    because we use a FIFO cache, and how Inventory records are likely to
    change. In general, you have a small number of records which change
    often, and a lot of records which do not change at all. So when the
    cache gets full, you actually flush out a lot of the records you are
    interested in, which means you need to recreate all of those records.
    An LRU Cache would be better, but the overhead negates the cache
    coherency benefit.

    One way to look at it, only the size of the cache > len(inv) is your
    'working' set. And in general, it shouldn't be a problem to hold 2
    inventories in memory anyway.

    :param inv_size: The number of entries in an inventory.
    :param entry_cache: The cache to check, or None for no cache. It must
        provide cache_size() and resize(new_size).
    """
    if entry_cache is None:
        return
    # 1.5 times might also be reasonable.
    recommended_min_cache_size = inv_size * 1.5
    if entry_cache.cache_size() < recommended_min_cache_size:
        # Below the 1.5x threshold: grow all the way to 2x.
        recommended_cache_size = inv_size * 2
        trace.mutter('Resizing the inventory entry cache from %d to %d',
                     entry_cache.cache_size(), recommended_cache_size)
        entry_cache.resize(recommended_cache_size)
|
3882.6.12
by John Arbash Meinel
Use resize logic to ensure our inventory entry cache is at an optimal size. |
202 |
|
2817.2.1
by Robert Collins
* Inventory serialisation no longer double-sha's the content. |
203 |
def write_inventory_to_lines(self, inv):
    """Return a list of lines with the encoded inventory."""
    # Passing None as the file means nothing is written; only the list of
    # lines returned by write_inventory is used.
    return self.write_inventory(inv, None)
|
206 |
||
207 |
def write_inventory_to_string(self, inv, working=False):
    """Just call write_inventory with a StringIO and return the value.

    :param inv: the inventory to serialise.
    :param working: If True skip history data - text_sha1, text_size,
        reference_revision, symlink_target.
    :return: Everything write_inventory wrote, as a single string.
    """
    sio = cStringIO.StringIO()
    self.write_inventory(inv, sio, working)
    return sio.getvalue()
216 |
||
2817.2.1
by Robert Collins
* Inventory serialisation no longer double-sha's the content. |
217 |
def write_inventory(self, inv, f, working=False):
    """Write inventory to a file.

    :param inv: the inventory to write.
    :param f: the file to write. (May be None if the lines are the desired
        output).
    :param working: If True skip history data - text_sha1, text_size,
        reference_revision, symlink_target.
    :return: The inventory as a list of lines.
    :raises errors.UnsupportedInventoryKind: if an entry has a kind this
        serializer does not write.
    """
    _ensure_utf8_re()
    self._check_revisions(inv)
    output = []
    append = output.append
    self._append_inventory_root(append, inv)
    entries = inv.iter_entries()
    # Skip the root
    root_path, root_ie = entries.next()
    # NOTE: every escaped attribute below is formatted as `attr="%s` with
    # no closing quote, because _encode_and_escape() returns its text with
    # the closing '"' already appended.
    for path, ie in entries:
        if ie.parent_id != self.root_id:
            parent_str = ' parent_id="'
            parent_id = _encode_and_escape(ie.parent_id)
        else:
            # Omit the parent_id attribute entirely.
            parent_str = ''
            parent_id = ''
        if ie.kind == 'file':
            if ie.executable:
                executable = ' executable="yes"'
            else:
                executable = ''
            if not working:
                append('<file%s file_id="%s name="%s%s%s revision="%s '
                    'text_sha1="%s" text_size="%d" />\n' % (
                    executable, _encode_and_escape(ie.file_id),
                    _encode_and_escape(ie.name), parent_str, parent_id,
                    _encode_and_escape(ie.revision), ie.text_sha1,
                    ie.text_size))
            else:
                append('<file%s file_id="%s name="%s%s%s />\n' % (
                    executable, _encode_and_escape(ie.file_id),
                    _encode_and_escape(ie.name), parent_str, parent_id))
        elif ie.kind == 'directory':
            if not working:
                append('<directory file_id="%s name="%s%s%s revision="%s '
                    '/>\n' % (
                    _encode_and_escape(ie.file_id),
                    _encode_and_escape(ie.name),
                    parent_str, parent_id,
                    _encode_and_escape(ie.revision)))
            else:
                append('<directory file_id="%s name="%s%s%s />\n' % (
                    _encode_and_escape(ie.file_id),
                    _encode_and_escape(ie.name),
                    parent_str, parent_id))
        elif ie.kind == 'symlink':
            if not working:
                append('<symlink file_id="%s name="%s%s%s revision="%s '
                    'symlink_target="%s />\n' % (
                    _encode_and_escape(ie.file_id),
                    _encode_and_escape(ie.name),
                    parent_str, parent_id,
                    _encode_and_escape(ie.revision),
                    _encode_and_escape(ie.symlink_target)))
            else:
                append('<symlink file_id="%s name="%s%s%s />\n' % (
                    _encode_and_escape(ie.file_id),
                    _encode_and_escape(ie.name),
                    parent_str, parent_id))
        elif ie.kind == 'tree-reference':
            # Only written when this serializer lists the kind in
            # supported_kinds.
            if ie.kind not in self.supported_kinds:
                raise errors.UnsupportedInventoryKind(ie.kind)
            if not working:
                append('<tree-reference file_id="%s name="%s%s%s '
                    'revision="%s reference_revision="%s />\n' % (
                    _encode_and_escape(ie.file_id),
                    _encode_and_escape(ie.name),
                    parent_str, parent_id,
                    _encode_and_escape(ie.revision),
                    _encode_and_escape(ie.reference_revision)))
            else:
                append('<tree-reference file_id="%s name="%s%s%s />\n' % (
                    _encode_and_escape(ie.file_id),
                    _encode_and_escape(ie.name),
                    parent_str, parent_id))
        else:
            raise errors.UnsupportedInventoryKind(ie.kind)
    append('</inventory>\n')
    if f is not None:
        f.writelines(output)
    # Just to keep the cache from growing without bounds
    # but we may actually not want to do clear the cache
    #_clear_cache()
    return output
1934.1.3
by John Arbash Meinel
[merge] robert's custom XML serializer, and cleanup for benchmarks and iter_entries() differences |
310 |
|
1934.1.8
by John Arbash Meinel
Passing around the append function rather than the list shaves off another 10%, down to 400ms |
311 |
def _append_inventory_root(self, append, inv):
    """Append the inventory root to output.

    Emits the opening ``<inventory>`` tag followed by the ``<directory>``
    element for the root entry.

    :param append: Callable taking one string; called once per output line.
    :param inv: The inventory whose root is being written.
    """
    if inv.revision_id is not None:
        # _encode_and_escape() supplies the closing quote of the attribute.
        revid1 = ' revision_id="'
        revid2 = _encode_and_escape(inv.revision_id)
    else:
        revid1 = ""
        revid2 = ""
    append('<inventory format="%s"%s%s>\n' % (
        self.format_num, revid1, revid2))
    append('<directory file_id="%s name="%s revision="%s />\n' % (
        _encode_and_escape(inv.root.file_id),
        _encode_and_escape(inv.root.name),
        _encode_and_escape(inv.root.revision)))
|
325 |
||
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
326 |
def _pack_revision(self, rev):
    """Revision object -> xml tree

    :param rev: The revision to serialise.
    :return: The root ``revision`` Element, with ``message``, optional
        ``parents`` and optional ``properties`` children.
    """
    # For the XML format, we need to write them as Unicode rather than as
    # utf-8 strings. So that cElementTree can handle properly escaping
    # them.
    decode_utf8 = cache_utf8.decode
    revision_id = rev.revision_id
    if isinstance(revision_id, str):
        revision_id = decode_utf8(revision_id)
    # A serializer may use a revision format number that differs from its
    # inventory format number.
    format_num = self.format_num
    if self.revision_format_num is not None:
        format_num = self.revision_format_num
    root = Element('revision',
                   committer=rev.committer,
                   timestamp='%.3f' % rev.timestamp,
                   revision_id=revision_id,
                   inventory_sha1=rev.inventory_sha1,
                   format=format_num,
                   )
    if rev.timezone is not None:
        root.set('timezone', str(rev.timezone))
    root.text = '\n'
    msg = SubElement(root, 'message')
    msg.text = escape_invalid_chars(rev.message)[0]
    msg.tail = '\n'
    if rev.parent_ids:
        pelts = SubElement(root, 'parents')
        pelts.tail = pelts.text = '\n'
        for parent_id in rev.parent_ids:
            _mod_revision.check_not_reserved_id(parent_id)
            p = SubElement(pelts, 'revision_ref')
            p.tail = '\n'
            if isinstance(parent_id, str):
                parent_id = decode_utf8(parent_id)
            p.set('revision_id', parent_id)
    if rev.properties:
        self._pack_revision_properties(rev, root)
    return root
1185.16.36
by Martin Pool
- store revision properties in revision xml |
364 |
|
365 |
def _pack_revision_properties(self, rev, under_element):
    """Add a ``properties`` child holding the revision's properties.

    :param rev: Object whose ``properties`` mapping is serialised.
    :param under_element: The element the ``properties`` element is
        appended to.
    """
    top_elt = SubElement(under_element, 'properties')
    # Sorted so that the output order does not depend on dict ordering.
    for prop_name, prop_value in sorted(rev.properties.items()):
        prop_elt = SubElement(top_elt, 'property')
        prop_elt.set('name', prop_name)
        prop_elt.text = prop_value
        prop_elt.tail = '\n'
    top_elt.tail = '\n'
|
373 |
||
3882.6.22
by John Arbash Meinel
Start moving things around so that the entry cache is passed in. |
374 |
def _unpack_inventory(self, elt, revision_id=None, entry_cache=None):
    """Construct from XML Element

    :param elt: The root ``inventory`` element.
    :param revision_id: Overwritten by the element's own ``revision_id``
        attribute before it is read, so the passed value has no effect.
    :param entry_cache: Optional cache passed through to _unpack_entry and
        _check_cache_size.
    :return: The Inventory built from the child elements of ``elt``.
    :raises errors.UnexpectedInventoryFormat: if the root tag is not
        ``inventory`` or the ``format`` attribute does not equal
        ``self.format_num``.
    """
    if elt.tag != 'inventory':
        raise errors.UnexpectedInventoryFormat('Root tag is %r' % elt.tag)
    format_num = elt.get('format')
    if format_num != self.format_num:
        raise errors.UnexpectedInventoryFormat('Invalid format version %r'
                                               % format_num)
    revision_id = elt.get('revision_id')
    if revision_id is not None:
        revision_id = cache_utf8.encode(revision_id)
    inv = inventory.Inventory(root_id=None, revision_id=revision_id)
    for e in elt:
        ie = self._unpack_entry(e, entry_cache=entry_cache)
        inv.add(ie)
    # Now that the inventory size is known, resize the cache if needed.
    self._check_cache_size(len(inv), entry_cache)
    return inv
391 |
||
3882.6.22
by John Arbash Meinel
Start moving things around so that the entry cache is passed in. |
392 |
def _unpack_entry(self, elt, entry_cache=None): |
3882.6.5
by John Arbash Meinel
Use a FIFOCache instead of an LRUCache, and factor out elt.get |
393 |
elt_get = elt.get |
394 |
file_id = elt_get('file_id') |
|
395 |
revision = elt_get('revision') |
|
396 |
# Check and see if we have already unpacked this exact entry
|
|
3882.6.8
by John Arbash Meinel
Add detailed timings on the last 100 mysql revisions. |
397 |
# Some timings for "repo.revision_trees(last_100_revs)"
|
398 |
# bzr mysql
|
|
399 |
# unmodified 4.1s 40.8s
|
|
3882.6.6
by John Arbash Meinel
Add some actual timings, supporting why we use a FIFOCache. |
400 |
# using lru 3.5s
|
3882.6.8
by John Arbash Meinel
Add detailed timings on the last 100 mysql revisions. |
401 |
# using fifo 2.83s 29.1s
|
3882.6.6
by John Arbash Meinel
Add some actual timings, supporting why we use a FIFOCache. |
402 |
# lru._cache 2.8s
|
3882.6.8
by John Arbash Meinel
Add detailed timings on the last 100 mysql revisions. |
403 |
# dict 2.75s 26.8s
|
404 |
# inv.add 2.5s 26.0s
|
|
405 |
# no_copy 2.00s 20.5s
|
|
406 |
# no_c,dict 1.95s 18.0s
|
|
3882.6.6
by John Arbash Meinel
Add some actual timings, supporting why we use a FIFOCache. |
407 |
# Note that a cache of 10k nodes is more than sufficient to hold all of
|
3882.6.9
by John Arbash Meinel
Add some more direct timings using time.clock() instead of lsprof. |
408 |
# the inventory for the last 100 revs for bzr, but not for mysql (20k
|
409 |
# is enough for mysql, which saves the same 2s as using a dict)
|
|
410 |
||
411 |
# Breakdown of mysql using time.clock()
|
|
412 |
# 4.1s 2 calls to element.get for file_id, revision_id
|
|
413 |
# 4.5s cache_hit lookup
|
|
414 |
# 7.1s InventoryFile.copy()
|
|
415 |
# 2.4s InventoryDirectory.copy()
|
|
416 |
# 0.4s decoding unique entries
|
|
3882.6.11
by John Arbash Meinel
comment update |
417 |
# 1.6s decoding entries after FIFO fills up
|
3882.6.9
by John Arbash Meinel
Add some more direct timings using time.clock() instead of lsprof. |
418 |
# 0.8s Adding nodes to FIFO (including flushes)
|
419 |
# 0.1s cache miss lookups
|
|
420 |
# Using an LRU cache
|
|
421 |
# 4.1s 2 calls to element.get for file_id, revision_id
|
|
422 |
# 9.9s cache_hit lookup
|
|
423 |
# 10.8s InventoryEntry.copy()
|
|
424 |
# 0.3s cache miss lookus
|
|
425 |
# 1.2s decoding entries
|
|
426 |
# 1.0s adding nodes to LRU
|
|
3882.6.22
by John Arbash Meinel
Start moving things around so that the entry cache is passed in. |
427 |
if entry_cache is not None and revision is not None: |
428 |
key = (file_id, revision) |
|
429 |
try: |
|
4031.3.1
by Frank Aspell
Fixing various typos |
430 |
# We copy it, because some operations may mutate it
|
3882.6.22
by John Arbash Meinel
Start moving things around so that the entry cache is passed in. |
431 |
cached_ie = entry_cache[key] |
432 |
except KeyError: |
|
433 |
pass
|
|
434 |
else: |
|
435 |
# Only copying directory entries drops us 2.85s => 2.35s
|
|
436 |
# if cached_ie.kind == 'directory':
|
|
437 |
# return cached_ie.copy()
|
|
438 |
# return cached_ie
|
|
439 |
return cached_ie.copy() |
|
3882.6.5
by John Arbash Meinel
Use a FIFOCache instead of an LRUCache, and factor out elt.get |
440 |
|
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
441 |
kind = elt.tag |
1399.1.6
by Robert Collins
move exporting functionality into inventory.py - uncovers bug in symlink support |
442 |
if not InventoryEntry.versionable_kind(kind): |
1092.2.20
by Robert Collins
symlink and weaves, whaddya know |
443 |
raise AssertionError('unsupported entry kind %s' % kind) |
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
444 |
|
3882.6.13
by John Arbash Meinel
We don't need to inline get_cached until we've had the miss. |
445 |
get_cached = _get_utf8_or_ascii |
446 |
||
3882.6.1
by John Arbash Meinel
Add an InventoryEntry cache to the xml deserializer. |
447 |
file_id = get_cached(file_id) |
448 |
if revision is not None: |
|
449 |
revision = get_cached(revision) |
|
3882.6.5
by John Arbash Meinel
Use a FIFOCache instead of an LRUCache, and factor out elt.get |
450 |
parent_id = elt_get('parent_id') |
2294.1.10
by John Arbash Meinel
Switch all apis over to utf8 file ids. All tests pass |
451 |
if parent_id is not None: |
452 |
parent_id = get_cached(parent_id) |
|
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
453 |
|
1399.1.8
by Robert Collins
factor out inventory directory logic into 'InventoryDirectory' class |
454 |
if kind == 'directory': |
1911.2.6
by John Arbash Meinel
Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly. |
455 |
ie = inventory.InventoryDirectory(file_id, |
3882.6.5
by John Arbash Meinel
Use a FIFOCache instead of an LRUCache, and factor out elt.get |
456 |
elt_get('name'), |
1399.1.8
by Robert Collins
factor out inventory directory logic into 'InventoryDirectory' class |
457 |
parent_id) |
1399.1.9
by Robert Collins
factor out file related logic from InventoryEntry to InventoryFile |
458 |
elif kind == 'file': |
1911.2.6
by John Arbash Meinel
Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly. |
459 |
ie = inventory.InventoryFile(file_id, |
3882.6.5
by John Arbash Meinel
Use a FIFOCache instead of an LRUCache, and factor out elt.get |
460 |
elt_get('name'), |
1399.1.9
by Robert Collins
factor out file related logic from InventoryEntry to InventoryFile |
461 |
parent_id) |
3882.6.5
by John Arbash Meinel
Use a FIFOCache instead of an LRUCache, and factor out elt.get |
462 |
ie.text_sha1 = elt_get('text_sha1') |
463 |
if elt_get('executable') == 'yes': |
|
1399.1.9
by Robert Collins
factor out file related logic from InventoryEntry to InventoryFile |
464 |
ie.executable = True |
3882.6.5
by John Arbash Meinel
Use a FIFOCache instead of an LRUCache, and factor out elt.get |
465 |
v = elt_get('text_size') |
1399.1.9
by Robert Collins
factor out file related logic from InventoryEntry to InventoryFile |
466 |
ie.text_size = v and int(v) |
1399.1.10
by Robert Collins
remove kind from the InventoryEntry constructor - only child classes should be created now |
467 |
elif kind == 'symlink': |
1911.2.6
by John Arbash Meinel
Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly. |
468 |
ie = inventory.InventoryLink(file_id, |
3882.6.5
by John Arbash Meinel
Use a FIFOCache instead of an LRUCache, and factor out elt.get |
469 |
elt_get('name'), |
1399.1.10
by Robert Collins
remove kind from the InventoryEntry constructor - only child classes should be created now |
470 |
parent_id) |
3882.6.5
by John Arbash Meinel
Use a FIFOCache instead of an LRUCache, and factor out elt.get |
471 |
ie.symlink_target = elt_get('symlink_target') |
1399.1.8
by Robert Collins
factor out inventory directory logic into 'InventoryDirectory' class |
472 |
else: |
2100.3.1
by Aaron Bentley
Start roundtripping tree-reference entries |
473 |
raise errors.UnsupportedInventoryKind(kind) |
1911.2.6
by John Arbash Meinel
Cache revision ids and file ids as part of xml processing. A custom xml parser could just call decode/encode directly. |
474 |
ie.revision = revision |
3882.6.22
by John Arbash Meinel
Start moving things around so that the entry cache is passed in. |
475 |
if revision is not None and entry_cache is not None: |
3882.6.21
by John Arbash Meinel
Don't cache the InventoryEntry we will return, callers mutate those objects. |
476 |
# We cache a copy() because callers like to mutate objects, and
|
477 |
# that would cause the item in cache to mutate as well.
|
|
478 |
# This has a small effect on many-inventory performance, because
|
|
479 |
# the majority fraction is spent in cache hits, not misses.
|
|
3882.6.22
by John Arbash Meinel
Start moving things around so that the entry cache is passed in. |
480 |
entry_cache[key] = ie.copy() |
1189
by Martin Pool
- BROKEN: partial support for commit into weave |
481 |
|
482 |
return ie |
|
483 |
||
484 |
def _unpack_revision(self, elt):
    """XML Element -> Revision object

    :param elt: The element holding a serialised revision.  Its
        attributes supply the committer, timestamp, revision id,
        inventory sha1 and (optionally) format and timezone.
    :return: A Revision built from those attributes, with parent ids,
        properties and message filled in from the child elements.
    :raises BzrError: If the element has a 'format' attribute that does
        not match the format number expected by this serializer.
    """
    elt_format = elt.get('format')
    # A revision-specific format number, when set, overrides the
    # serializer-wide one.
    format_num = self.format_num
    if self.revision_format_num is not None:
        format_num = self.revision_format_num
    # The format attribute is optional; it is only checked when present.
    if elt_format is not None:
        if elt_format != format_num:
            raise BzrError("invalid format version %r on revision"
                           % elt_format)
    get_cached = _get_utf8_or_ascii
    rev = Revision(committer=elt.get('committer'),
                   timestamp=float(elt.get('timestamp')),
                   revision_id=get_cached(elt.get('revision_id')),
                   inventory_sha1=elt.get('inventory_sha1')
                   )
    # Compare with None explicitly: the truth value of an Element says
    # whether it has children, not whether find() located it.  An empty
    # <parents> element just produces no iterations.
    parents = elt.find('parents')
    if parents is not None:
        for p in parents:
            rev.parent_ids.append(get_cached(p.get('revision_id')))
    self._unpack_revision_properties(elt, rev)
    # A missing timezone attribute is treated as an offset of 0.
    v = elt.get('timezone')
    if v is None:
        rev.timezone = 0
    else:
        rev.timezone = int(v)
    rev.message = elt.findtext('message')  # text of <message>
    return rev
|
511 |
||
1185.16.37
by Martin Pool
- properties are retrieved when revisions are loaded |
512 |
def _unpack_revision_properties(self, elt, rev):
    """Unpack properties onto a revision.

    :param elt: The revision element; its 'properties' child, if there
        is one, is read.
    :param rev: The revision whose ``properties`` mapping is filled in.
    :raises AssertionError: If a child of the properties list is not a
        'property' element, or if a property name occurs more than once.
    """
    props_elt = elt.find('properties')
    # Test for None explicitly instead of relying on the truth value of
    # an Element, which reflects whether it has children and is
    # deprecated in newer ElementTree releases.  An empty properties
    # list simply yields no iterations below.
    if props_elt is None:
        return
    for prop_elt in props_elt:
        if prop_elt.tag != 'property':
            raise AssertionError(
                "bad tag under properties list: %r" % prop_elt.tag)
        name = prop_elt.get('name')
        value = prop_elt.text
        # If a property had an empty value ('') cElementTree reads
        # that back as None, convert it back to '', so that all
        # properties have string values
        if value is None:
            value = ''
        if name in rev.properties:
            raise AssertionError("repeated property %r" % name)
        rev.properties[name] = value
531 |
||
532 |
||
3311.3.4
by Aaron Bentley
Have xml5 inherit from xml6 from xml8 |
533 |
# Module-level Serializer_v8 instance, created when this module is imported.
serializer_v8 = Serializer_v8()