~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/xml5.py

Committer: Canonical.com Patch Queue Manager
Date: 2006-08-17 14:18:30 UTC
mfrom: (1934.1.21 xml_writer)
Revision ID: pqm@pqm.ubuntu.com-20060817141830-383cad75e9090732

(robertc,jam) Custom xml serializer saves a lot of time when serializing inventories

files added:
bzrlib/benchmarks/bench_xml.py

files modified:
NEWS

bzrlib/benchmarks/__init__.py

bzrlib/benchmarks/tree_creator/kernel_like.py

bzrlib/cache_utf8.py

bzrlib/tests/test_xml.py

bzrlib/xml5.py

Show diffs side-by-side

added added

removed removed

bzrlib/xml5.py

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import cStringIO

import re

from bzrlib import (

cache_utf8,

inventory,

)

from bzrlib.xml_serializer import SubElement, Element, Serializer

from bzrlib.inventory import ROOT_ID, Inventory, InventoryEntry

import bzrlib.inventory as inventory

from bzrlib.revision import Revision

from bzrlib.errors import BzrError

# Compiled lazily by _ensure_utf8_re(); stays None until that is first called.
_utf8_re = None

# Maps each XML metacharacter to the entity reference that replaces it.
# Characters matched by the regex but absent from this map are written as
# numeric character references by _utf8_escape_replace().
_utf8_escape_map = {
    "&": '&amp;',
    "'": "&apos;", # FIXME: overkill
    "\"": "&quot;",
    "<": "&lt;",
    ">": "&gt;",
}

def _ensure_utf8_re():
    """Compile the module-level escaping regex the first time it is needed.

    The pattern matches one character at a time: either an XML
    metacharacter (& < > ' ") or any character in U+0080..U+FFFF.
    Once the regex exists, further calls do nothing.
    """
    global _utf8_re
    if _utf8_re is None:
        _utf8_re = re.compile(u'[&<>\'\"\u0080-\uffff]')

def _utf8_escape_replace(match, _map=_utf8_escape_map):
    """Return the XML escape for the single character matched by *match*.

    Intended as the replacement callback for the regex's sub().
    Characters present in *_map* (the standard XML escapes for <>"' and &)
    become their named entity.  Anything else the regex matched is a
    non-ascii character and becomes a decimal character reference
    ("&#NNN;"), because ElementTree did the same.  That keeps the output
    compatible with older versions of bzr; the policy may change later.

    :param match: a regex match object covering exactly one character.
    :param _map: character -> entity lookup table; bound at definition
        time so the lookup avoids a global name resolution.
    :return: the replacement text for the matched character.
    """
    # jam 20060816: benchmarks showed try/KeyError beating an explicit
    # "is None" check by roughly 10% overall when misses are rare, while
    # the None check wins when misses are frequent.  Revision ids, file ids
    # and path names are *rarely* unicode, so misses are rare here and
    # try/KeyError is the right trade-off.
    char = match.group()
    try:
        return _map[char]
    except KeyError:
        return "&#%d;" % ord(char)

_unicode_to_escaped_map = {}

def _encode_and_escape(unicode_str, _map=_unicode_to_escaped_map):
    """Return *unicode_str* escaped for XML, with a closing '"' appended.

    The trailing double quote is part of the returned (and cached) value,
    so a caller only has to emit the opening ' name="' before it.

    The module-level regex must already be compiled (see
    _ensure_utf8_re()) before this is called with an uncached string.

    :param unicode_str: the text to escape.
    :param _map: cache from input string to escaped result; shared across
        calls because it is bound as a default argument.
    :return: a str holding the escaped text followed by '"'.
    """
    # Unseen strings are common, so a .get() plus None test is cheaper
    # here than try/KeyError.
    cached = _map.get(unicode_str)
    if cached is not None:
        return cached

    # The alternative policy would be a regular UTF8 encoding followed by
    # escaping only the XML metacharacters.  With cache_utf8 the speed is
    # equivalent, *but* the serialized texts would then be incompatible
    # with old versions of bzr, so there is no net gain.  (The read code
    # might cope with utf8 better than with entity escapes, but
    # cElementTree seems to do just fine either way.)
    escaped = str(_utf8_re.sub(_utf8_escape_replace, unicode_str)) + '"'
    _map[unicode_str] = escaped
    return escaped

def _clear_cache():
    """Empty the unicode => escaped-text cache.

    The dictionary is cleared in place rather than replaced, so every
    existing reference to it sees the emptied cache.
    """
    cache = _unicode_to_escaped_map
    cache.clear()

class Serializer_v5(Serializer):

"""Version 5 serializer

"""

__slots__ = []

def _pack_inventory(self, inv):

"""Convert to XML Element"""

100

def write_inventory_to_string(self, inv):
    """Serialize *inv* and return the result as a single string.

    Just calls write_inventory() with an in-memory StringIO buffer and
    returns the buffer's contents.

    :param inv: the inventory to serialize.
    :return: everything write_inventory() wrote to the buffer.
    """
    sio = cStringIO.StringIO()
    self.write_inventory(inv, sio)
    return sio.getvalue()

105

106

def write_inventory(self, inv, f):
    """Write inventory to a file.

    The XML is assembled as a list of string fragments and handed to
    ``f.writelines()`` in one call.

    :param inv: the inventory to write.
    :param f: the file to write; only its writelines() method is used.
    """
    # The escaping helpers rely on the module regex being compiled.
    _ensure_utf8_re()
    output = []
    # Bind the bound method once; it is called for every fragment.
    append = output.append
    self._append_inventory_root(append, inv)
    entries = inv.iter_entries()
    # Skip the root: it was already written as the <inventory> element.
    entries.next()
    for path, ie in entries:
        self._append_entry(append, ie)
    append('</inventory>\n')
    f.writelines(output)
    # Clearing the cache here would keep it from growing without bounds,
    # but we may actually not want to clear it, so this stays disabled.
    #_clear_cache()

126

127

def _append_inventory_root(self, append, inv):
    """Append the opening <inventory ...> tag for *inv* to the output.

    :param append: callable that receives each output fragment.
    :param inv: the inventory whose root file id and revision id are
        written as attributes.
    """
    append('<inventory')
    # The file_id attribute is omitted when the root id is None or ROOT_ID.
    if inv.root.file_id not in (None, ROOT_ID):
        append(' file_id="')
        # _encode_and_escape() supplies the closing double quote.
        append(_encode_and_escape(inv.root.file_id))
    append(' format="5"')
    if inv.revision_id is not None:
        append(' revision_id="')
        append(_encode_and_escape(inv.revision_id))
    append('>\n')

138

139

def _append_entry(self, append, ie):
    """Convert InventoryEntry to an XML element and append it to output.

    Emits one self-closing element named after ``ie.kind``.  Attributes
    are written in a fixed order: executable, file_id, name, parent_id,
    revision, symlink_target, text_sha1, text_size.  Optional attributes
    are left out when their value is unset.

    :param append: callable that receives each output fragment.
    :param ie: the inventory entry to serialize.
    """
    assert InventoryEntry.versionable_kind(ie.kind), \
        'unsupported entry kind %s' % ie.kind

    append("<")
    append(ie.kind)
    if ie.executable:
        append(' executable="yes"')
    # Each _encode_and_escape() result already ends with the closing
    # double quote, so only the opening part is written explicitly.
    append(' file_id="')
    append(_encode_and_escape(ie.file_id))
    append(' name="')
    append(_encode_and_escape(ie.name))
    # parent_id is not written when the parent is the root (ROOT_ID).
    if ie.parent_id != ROOT_ID:
        assert isinstance(ie.parent_id, basestring)
        append(' parent_id="')
        append(_encode_and_escape(ie.parent_id))
    if ie.revision is not None:
        append(' revision="')
        append(_encode_and_escape(ie.revision))
    if ie.symlink_target is not None:
        append(' symlink_target="')
        append(_encode_and_escape(ie.symlink_target))
    if ie.text_sha1 is not None:
        # The sha1 is written without escaping, so the closing quote
        # has to be added by hand.
        append(' text_sha1="')
        append(ie.text_sha1)
        append('"')
    if ie.text_size is not None:
        append(' text_size="%d"' % ie.text_size)
    append(" />\n")

171

172

def _pack_revision(self, rev):

173

"""Revision object -> xml tree"""

104

196

self._pack_revision_properties(rev, root)

105

197

return root

106

198

107

108

199

def _pack_revision_properties(self, rev, under_element):

109

200

top_elt = SubElement(under_element, 'properties')

110

201

for prop_name, prop_value in sorted(rev.properties.items()):

116

207

prop_elt.tail = '\n'

117

208

top_elt.tail = '\n'

118

209

119

120

210

def _unpack_inventory(self, elt):

121

211

"""Construct from XML Element

122

212

"""

138

228

inv.add(ie)

139

229

return inv

140

230

141

142

231

def _unpack_entry(self, elt):

143

232

kind = elt.tag

144

233

if not InventoryEntry.versionable_kind(kind):

149

238

parent_id = elt.get('parent_id')

150

239

if parent_id == None:

151

240

parent_id = ROOT_ID

152

parent_id = get_cached(parent_id)

153

file_id = get_cached(elt.get('file_id'))

241

# TODO: jam 20060817 At present, caching file ids costs us too

242

# much time. It slows down overall read performances from

243

# approx 500ms to 700ms. And doesn't improve future reads.

244

# it might be because revision ids and file ids are mixing.

245

# Consider caching *just* the file ids, for a limited period

246

# of time.

247

#parent_id = get_cached(parent_id)

248

#file_id = get_cached(elt.get('file_id'))

249

file_id = elt.get('file_id')

154

250

155

251

if kind == 'directory':

156

252

ie = inventory.InventoryDirectory(file_id,

179

275

180

276

return ie

181

277

182

183

278

def _unpack_revision(self, elt):

184

279

"""XML Element -> Revision object"""

185

280

assert elt.tag == 'revision'

208

303

rev.message = elt.findtext('message') # text of <message>

209

304

return rev

210

305

211

212

306

def _unpack_revision_properties(self, elt, rev):

213

307

"""Unpack properties onto a revision."""

214

308

props_elt = elt.find('properties')

Older »