~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/xml5.py

Committer: John Arbash Meinel
Date: 2006-08-14 16:16:53 UTC
mto: (1946.2.6 reduce-knit-churn)
mto: This revision was merged to the branch mainline in revision 1919.
Revision ID: john@arbash-meinel.com-20060814161653-54cdcdadcd4e9003

Remove bogus entry from BRANCH.TODO

files added:
bzrlib/util/urlgrabber

bzrlib/util/urlgrabber/__init__.py

bzrlib/util/urlgrabber/byterange.py

bzrlib/util/urlgrabber/grabber.py

bzrlib/util/urlgrabber/keepalive.py

bzrlib/util/urlgrabber/mirror.py

bzrlib/util/urlgrabber/progress.py

files removed:
bzrlib/benchmarks/bench_cache_utf8.py

bzrlib/benchmarks/bench_sftp.py

bzrlib/benchmarks/bench_xml.py

bzrlib/benchmarks/tree_creator

bzrlib/benchmarks/tree_creator/__init__.py

bzrlib/benchmarks/tree_creator/heavily_merged.py

bzrlib/benchmarks/tree_creator/kernel_like.py

bzrlib/benchmarks/tree_creator/simple_many_commit.py

bzrlib/cache_utf8.py

bzrlib/tests/blackbox/test_testament.py

bzrlib/tests/test_cache_utf8.py

bzrlib/tests/test_version.py

bzrlib/version.py

tools/rst2html.py

files renamed:
bzrlib/tests/repository_implementations/test_revision.py => bzrlib/tests/repository_implementations/test_revprops.py

files modified:
.bzrignore

Makefile

NEWS

bzrlib/__init__.py

bzrlib/add.py

bzrlib/benchmarks/__init__.py

bzrlib/benchmarks/bench_add.py

bzrlib/benchmarks/bench_bench.py

bzrlib/benchmarks/bench_checkout.py

bzrlib/benchmarks/bench_commit.py

bzrlib/benchmarks/bench_log.py

bzrlib/benchmarks/bench_osutils.py

bzrlib/benchmarks/bench_status.py

bzrlib/benchmarks/bench_transform.py

bzrlib/benchmarks/bench_workingtree.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bundle/bundle_data.py

bzrlib/bundle/commands.py

bzrlib/bundle/serializer/__init__.py

bzrlib/bundle/serializer/v08.py

bzrlib/bzrdir.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/delta.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/gpg.py

bzrlib/help.py

bzrlib/inventory.py

bzrlib/knit.py

bzrlib/lsprof.py

bzrlib/option.py

bzrlib/osutils.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/revisionspec.py

bzrlib/revisiontree.py

bzrlib/testament.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_add.py

bzrlib/tests/blackbox/test_annotate.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_commit_builder.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_revisionnamespaces.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_xml.py

bzrlib/tests/workingtree_implementations/test_basis_inventory.py

bzrlib/transport/__init__.py

bzrlib/transport/sftp.py

bzrlib/tuned_gzip.py

bzrlib/workingtree.py

bzrlib/xml5.py

doc/tutorial.txt

setup.py

tools/doc_generate/autodoc_man.py

Show diffs side-by-side

added added

removed removed

bzrlib/xml5.py

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import cStringIO

import re

from bzrlib import (

cache_utf8,

inventory,

)

from bzrlib.xml_serializer import SubElement, Element, Serializer

from bzrlib.inventory import ROOT_ID, Inventory, InventoryEntry

import bzrlib.inventory as inventory

from bzrlib.revision import Revision

from bzrlib.errors import BzrError

# Lazily compiled pattern matching every character that has to be escaped.
# It stays None until _ensure_utf8_re() has been called.
_utf8_re = None

# Predefined XML entity for each markup-significant character.  Characters
# matched by _utf8_re that are not listed here are written as numeric
# character references by _utf8_escape_replace().
_utf8_escape_map = {
    "&": '&amp;',
    "'": "&apos;",  # FIXME: overkill
    "\"": "&quot;",
    "<": "&lt;",
    ">": "&gt;",
}

def _ensure_utf8_re():
    """Compile the module-level ``_utf8_re`` pattern on first use.

    The pattern matches one character at a time: the XML metacharacters
    (ampersand, angle brackets, single and double quote) and every code
    point from U+0080 through U+FFFF.  Calls made once the pattern exists
    do nothing.
    """
    global _utf8_re
    if _utf8_re is None:
        _utf8_re = re.compile(u'[&<>\'\"\u0080-\uffff]')

def _utf8_escape_replace(match, _map=_utf8_escape_map):
    """Return the XML escape for the single character held by *match*.

    Meant to be used as the replacement callback of ``_utf8_re.sub``.
    A character with an entry in the entity map (the standard XML escapes)
    is replaced by that entry; any other matched character becomes a
    decimal character reference.

    Non-ascii characters are escaped because ElementTree did so, which
    keeps the output compatible with older versions of bzr.  That policy
    may change in the future.

    :param match: regex match object covering exactly one character.
    :param _map: character => entity lookup table; bound as a default
        argument and not meant to be passed by callers.
    :return: the replacement text for the matched character.
    """
    # jam 20060816 Benchmarks show try/KeyError is about 10% faster
    # overall when the lookup rarely misses, while an explicit None check
    # is much faster when misses are frequent.  Revision ids, file ids and
    # path names are *rarely* unicode, so misses are rare here and
    # try/KeyError is the better fit.
    char = match.group()
    try:
        return _map[char]
    except KeyError:
        return "&#%d;" % ord(char)

# Cache of already escaped strings, keyed by the original string.
_unicode_to_escaped_map = {}


def _encode_and_escape(unicode_str, _map=_unicode_to_escaped_map):
    """Return *unicode_str* escaped for XML, followed by a closing quote.

    Every character matched by ``_utf8_re`` is replaced through
    ``_utf8_escape_replace``, the result is passed through ``str()`` and a
    double quote is appended, so the caller only has to write the opening
    quote of the attribute value.  Results are memoised in *_map*.

    ``_utf8_re`` starts out as None, so ``_ensure_utf8_re()`` must have
    been called before the first uncached string is escaped.

    :param unicode_str: the text to escape.
    :param _map: the memoisation dict; bound as a default argument and not
        meant to be passed by callers.
    :return: the escaped text with a trailing ``"``.
    """
    # New strings show up frequently, so a None check on .get() is cheaper
    # here than try/KeyError.
    cached = _map.get(unicode_str)
    if cached is not None:
        return cached

    # The alternative policy would be a regular UTF8 encoding with only
    # the XML meta characters escaped.  Performance is equivalent once
    # cache_utf8 is used, *however* the serialized texts would become
    # incompatible with old versions of bzr, so there is no net gain.
    # (The read code might cope with utf8 better than with entity escapes,
    # but cElementTree seems to do just fine either way.)
    escaped = str(_utf8_re.sub(_utf8_escape_replace, unicode_str)) + '"'
    _map[unicode_str] = escaped
    return escaped

def _clear_cache():
    """Empty the unicode => escaped cache.

    The dict is emptied in place rather than rebound, because
    ``_encode_and_escape`` holds the same dict object as its default
    argument.
    """
    _unicode_to_escaped_map.clear()

class Serializer_v5(Serializer):

"""Version 5 serializer

"""

__slots__ = []

100

def write_inventory_to_string(self, inv):

101

"""Just call write_inventory with a StringIO and return the value"""

102

sio = cStringIO.StringIO()

103

self.write_inventory(inv, sio)

104

return sio.getvalue()

105

106

def write_inventory(self, inv, f):

107

"""Write inventory to a file.

108

109

:param inv: the inventory to write.

110

:param f: the file to write.

111

"""

112

_ensure_utf8_re()

113

output = []

114

append = output.append

115

self._append_inventory_root(append, inv)

def _pack_inventory(self, inv):

"""Convert to XML Element"""

116

entries = inv.iter_entries()

117

# Skip the root

118

root_path, root_ie = entries.next()

e = Element('inventory',

format='5')

e.text = '\n'

path, root = entries.next()

if root.file_id not in (None, ROOT_ID):

e.set('file_id', root.file_id)

if inv.revision_id is not None:

e.set('revision_id', inv.revision_id)

119

for path, ie in entries:

120

self._append_entry(append, ie)

121

append('</inventory>\n')

122

f.writelines(output)

123

# Just to keep the cache from growing without bounds

124

# but we may actually not want to clear the cache

125

#_clear_cache()

e.append(self._pack_entry(ie))

return e

126

127

def _append_inventory_root(self, append, inv):

128

"""Append the inventory root to output."""

129

append('<inventory')

130

if inv.root.file_id not in (None, ROOT_ID):

131

append(' file_id="')

132

append(_encode_and_escape(inv.root.file_id))

133

append(' format="5"')

134

if inv.revision_id is not None:

135

append(' revision_id="')

136

append(_encode_and_escape(inv.revision_id))

137

append('>\n')

138

139

def _append_entry(self, append, ie):

140

"""Convert InventoryEntry to XML element and append to output."""

def _pack_entry(self, ie):

"""Convert InventoryEntry to XML element"""

141

# TODO: should just be a plain assertion

142

assert InventoryEntry.versionable_kind(ie.kind), \

143

'unsupported entry kind %s' % ie.kind

144

145

append("<")

146

append(ie.kind)

if not InventoryEntry.versionable_kind(ie.kind):

raise AssertionError('unsupported entry kind %s' % ie.kind)

e = Element(ie.kind)

e.set('name', ie.name)

e.set('file_id', ie.file_id)

if ie.text_size != None:

e.set('text_size', '%d' % ie.text_size)

for f in ['text_sha1', 'revision', 'symlink_target']:

v = getattr(ie, f)

if v != None:

e.set(f, v)

147

if ie.executable:

148

append(' executable="yes"')

149

append(' file_id="')

150

append(_encode_and_escape(ie.file_id))

151

append(' name="')

152

append(_encode_and_escape(ie.name))

e.set('executable', 'yes')

# to be conservative, we don't externalize the root pointers

# for now, leaving them as null in the xml form. in a future

# version it will be implied by nested elements.

153

if ie.parent_id != ROOT_ID:

154

assert isinstance(ie.parent_id, basestring)

155

append(' parent_id="')

156

append(_encode_and_escape(ie.parent_id))

157

if ie.revision is not None:

158

append(' revision="')

159

append(_encode_and_escape(ie.revision))

160

if ie.symlink_target is not None:

161

append(' symlink_target="')

162

append(_encode_and_escape(ie.symlink_target))

163

if ie.text_sha1 is not None:

164

append(' text_sha1="')

165

append(ie.text_sha1)

166

append('"')

167

if ie.text_size is not None:

168

append(' text_size="%d"' % ie.text_size)

169

append(" />\n")

170

return

e.set('parent_id', ie.parent_id)

e.tail = '\n'

return e

171

172

def _pack_revision(self, rev):

173

"""Revision object -> xml tree"""

178

inventory_sha1 = rev.inventory_sha1,

179

format='5',

180

)

181

if rev.timezone is not None:

if rev.timezone:

182

root.set('timezone', str(rev.timezone))

183

root.text = '\n'

184

msg = SubElement(root, 'message')

196

101

self._pack_revision_properties(rev, root)

197

102

return root

198

103

104

199

105

def _pack_revision_properties(self, rev, under_element):

200

106

top_elt = SubElement(under_element, 'properties')

201

107

for prop_name, prop_value in sorted(rev.properties.items()):

207

113

prop_elt.tail = '\n'

208

114

top_elt.tail = '\n'

209

115

116

210

117

def _unpack_inventory(self, elt):

211

118

"""Construct from XML Element

212

119

"""

218

125

raise BzrError("invalid format version %r on inventory"

219

126

% format)

220

127

revision_id = elt.get('revision_id')

221

if revision_id is not None:

222

revision_id = cache_utf8.get_cached_unicode(revision_id)

223

128

inv = Inventory(root_id, revision_id=revision_id)

224

129

for e in elt:

225

130

ie = self._unpack_entry(e)

228

133

inv.add(ie)

229

134

return inv

230

135

136

231

137

def _unpack_entry(self, elt):

232

138

kind = elt.tag

233

139

if not InventoryEntry.versionable_kind(kind):

234

140

raise AssertionError('unsupported entry kind %s' % kind)

235

141

236

get_cached = cache_utf8.get_cached_unicode

237

238

142

parent_id = elt.get('parent_id')

239

143

if parent_id == None:

240

144

parent_id = ROOT_ID

241

# TODO: jam 20060817 At present, caching file ids costs us too

242

# much time. It slows down overall read performances from

243

# approx 500ms to 700ms. And doesn't improve future reads.

244

# it might be because revision ids and file ids are mixing.

245

# Consider caching *just* the file ids, for a limited period

246

# of time.

247

#parent_id = get_cached(parent_id)

248

#file_id = get_cached(elt.get('file_id'))

249

file_id = elt.get('file_id')

250

145

251

146

if kind == 'directory':

252

ie = inventory.InventoryDirectory(file_id,

147

ie = inventory.InventoryDirectory(elt.get('file_id'),

253

148

elt.get('name'),

254

149

parent_id)

255

150

elif kind == 'file':

256

ie = inventory.InventoryFile(file_id,

151

ie = inventory.InventoryFile(elt.get('file_id'),

257

152

elt.get('name'),

258

153

parent_id)

259

154

ie.text_sha1 = elt.get('text_sha1')

262

157

v = elt.get('text_size')

263

158

ie.text_size = v and int(v)

264

159

elif kind == 'symlink':

265

ie = inventory.InventoryLink(file_id,

160

ie = inventory.InventoryLink(elt.get('file_id'),

266

161

elt.get('name'),

267

162

parent_id)

268

163

ie.symlink_target = elt.get('symlink_target')

269

164

else:

270

165

raise BzrError("unknown kind %r" % kind)

271

revision = elt.get('revision')

272

if revision is not None:

273

revision = get_cached(revision)

274

ie.revision = revision

166

ie.revision = elt.get('revision')

275

167

276

168

return ie

277

169

170

278

171

def _unpack_revision(self, elt):

279

172

"""XML Element -> Revision object"""

280

173

assert elt.tag == 'revision'

283

176

if format != '5':

284

177

raise BzrError("invalid format version %r on inventory"

285

178

% format)

286

get_cached = cache_utf8.get_cached_unicode

287

179

rev = Revision(committer = elt.get('committer'),

288

180

timestamp = float(elt.get('timestamp')),

289

revision_id = get_cached(elt.get('revision_id')),

181

revision_id = elt.get('revision_id'),

290

182

inventory_sha1 = elt.get('inventory_sha1')

291

183

)

292

184

parents = elt.find('parents') or []

293

185

for p in parents:

294

186

assert p.tag == 'revision_ref', \

295

187

"bad parent node tag %r" % p.tag

296

rev.parent_ids.append(get_cached(p.get('revision_id')))

188

rev.parent_ids.append(p.get('revision_id'))

297

189

self._unpack_revision_properties(elt, rev)

298

190

v = elt.get('timezone')

299

if v is None:

300

rev.timezone = 0

301

else:

302

rev.timezone = int(v)

191

rev.timezone = v and int(v)

303

192

rev.message = elt.findtext('message') # text of <message>

304

193

return rev

305

194

195

306

196

def _unpack_revision_properties(self, elt, rev):

307

197

"""Unpack properties onto a revision."""

308

198

props_elt = elt.find('properties')

Older »