~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/xml5.py

Committer: Canonical.com Patch Queue Manager
Date: 2007-01-17 17:21:14 UTC
mfrom: (2229.2.5 reserved-ids)
Revision ID: pqm@pqm.ubuntu.com-20070117172114-dc75493dad46088c

Ensure reserved ids are never stored

files removed:
bzrlib/repofmt

bzrlib/tests/branch_implementations/test_hooks.py

files modified:
HACKING

NEWS

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/cache_utf8.py

bzrlib/decorators.py

bzrlib/errors.py

bzrlib/generate_ids.py

bzrlib/help_topics.py

bzrlib/knit.py

bzrlib/lockable_files.py

bzrlib/merge.py

bzrlib/option.py

bzrlib/osutils.py

bzrlib/repository.py

bzrlib/revisiontree.py

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_mv.py

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_commit_builder.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_cache_utf8.py

bzrlib/tests/test_decorators.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_generate_docs.py

bzrlib/tests/test_generate_ids.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_source.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/test_xml.py

bzrlib/tests/workingtree_implementations/test_get_parent_ids.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/trace.py

bzrlib/transform.py

bzrlib/version.py

bzrlib/versionedfile.py

bzrlib/workingtree.py

bzrlib/xml5.py

bzrlib/xml6.py

setup.py

Show diffs side-by-side

added added

removed removed

bzrlib/xml5.py

_utf8_re = None

_unicode_re = None

_xml_escape_map = {

_utf8_escape_map = {

"&":'&amp;',

"'":"&apos;", # FIXME: overkill

"\"":"&quot;",

def _ensure_utf8_re():

"""Make sure the _utf8_re and _unicode_re regexes have been compiled."""

global _utf8_re, _unicode_re

if _utf8_re is None:

_utf8_re = re.compile('[&<>\'\"]|[\x80-\xff]+')

if _unicode_re is None:

_unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')

def _unicode_escape_replace(match, _map=_xml_escape_map):

"""Make sure the _utf8_re regex has been compiled"""

global _utf8_re

if _utf8_re is not None:

return

_utf8_re = re.compile(u'[&<>\'\"\u0080-\uffff]')

def _utf8_escape_replace(match, _map=_utf8_escape_map):

"""Replace a string of non-ascii, non XML safe characters with their escape

This will escape both Standard XML escapes, like <>"', etc.

return "&#%d;" % ord(match.group())

def _utf8_escape_replace(match, _map=_xml_escape_map):

"""Escape utf8 characters into XML safe ones.

This uses 2 tricks. It is either escaping "standard" characters, like "&<>,

or it is handling characters with the high-bit set. For ascii characters,

we just lookup the replacement in the dictionary. For everything else, we

decode back into Unicode, and then use the XML escape code.

"""

try:

return _map[match.group()]

except KeyError:

return ''.join('&#%d;' % ord(uni_chr)

for uni_chr in match.group().decode('utf8'))

_to_escaped_map = {}

def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):

_unicode_to_escaped_map = {}

def _encode_and_escape(unicode_str, _map=_unicode_to_escaped_map):

"""Encode the string into utf8, and escape invalid XML characters"""

# We frequently get entities we have not seen before, so it is better

# to check if None, rather than try/KeyError

text = _map.get(unicode_or_utf8_str)

text = _map.get(unicode_str)

if text is None:

if unicode_or_utf8_str.__class__ == unicode:

# The alternative policy is to do a regular UTF8 encoding

# and then escape only XML meta characters.

# Performance is equivalent once you use cache_utf8. *However*

# this makes the serialized texts incompatible with old versions

# of bzr. So no net gain. (Perhaps the read code would handle utf8

# better than entity escapes, but cElementTree seems to do just fine

# either way)

100

text = str(_unicode_re.sub(_unicode_escape_replace,

101

unicode_or_utf8_str)) + '"'

102

else:

103

# Plain strings are considered to already be in utf-8 so we do a

104

# slightly different method for escaping.

105

text = _utf8_re.sub(_utf8_escape_replace,

106

unicode_or_utf8_str) + '"'

107

_map[unicode_or_utf8_str] = text

# The alternative policy is to do a regular UTF8 encoding

# and then escape only XML meta characters.

# Performance is equivalent once you use cache_utf8. *However*

# this makes the serialized texts incompatible with old versions

# of bzr. So no net gain. (Perhaps the read code would handle utf8

# better than entity escapes, but cElementTree seems to do just fine

# either way)

text = str(_utf8_re.sub(_utf8_escape_replace, unicode_str)) + '"'

_map[unicode_str] = text

108

return text

109

110

111

def _get_utf8_or_ascii(a_str,

112

_encode_utf8=cache_utf8.encode,

113

_get_cached_ascii=cache_utf8.get_cached_ascii):

114

"""Return a cached version of the string.

115

116

cElementTree will return a plain string if the XML is plain ascii. It only

117

returns Unicode when it needs to. We want to work in utf-8 strings. So if

118

cElementTree returns a plain string, we can just return the cached version.

119

If it is Unicode, then we need to encode it.

120

121

:param a_str: An 8-bit string or Unicode as returned by

122

cElementTree.Element.get()

123

:return: A utf-8 encoded 8-bit string.

124

"""

125

# This is fairly optimized because we know what cElementTree does, this is

126

# not meant as a generic function for all cases. Because it is possible for

127

# an 8-bit string to not be ascii or valid utf8.

128

if a_str.__class__ == unicode:

129

return _encode_utf8(a_str)

130

else:

131

return _get_cached_ascii(a_str)

132

133

134

def _clear_cache():

135

"""Clean out the unicode => escaped map"""

136

_to_escaped_map.clear()

_unicode_to_escaped_map.clear()

137

138

139

class Serializer_v5(Serializer):

225

178

226

179

def _pack_revision(self, rev):

227

180

"""Revision object -> xml tree"""

228

# For the XML format, we need to write them as Unicode rather than as

229

# utf-8 strings. So that cElementTree can handle properly escaping

230

# them.

231

decode_utf8 = cache_utf8.decode

232

revision_id = rev.revision_id

233

if isinstance(revision_id, str):

234

revision_id = decode_utf8(revision_id)

235

181

root = Element('revision',

236

182

committer = rev.committer,

237

183

timestamp = '%.3f' % rev.timestamp,

238

revision_id = revision_id,

184

revision_id = rev.revision_id,

239

185

inventory_sha1 = rev.inventory_sha1,

240

186

format='5',

241

187

)

252

198

assert isinstance(parent_id, basestring)

253

199

p = SubElement(pelts, 'revision_ref')

254

200

p.tail = '\n'

255

if isinstance(parent_id, str):

256

parent_id = decode_utf8(parent_id)

257

201

p.set('revision_id', parent_id)

258

202

if rev.properties:

259

203

self._pack_revision_properties(rev, root)

282

226

% format)

283

227

revision_id = elt.get('revision_id')

284

228

if revision_id is not None:

285

revision_id = cache_utf8.encode(revision_id)

229

revision_id = cache_utf8.get_cached_unicode(revision_id)

286

230

inv = Inventory(root_id, revision_id=revision_id)

287

231

for e in elt:

288

232

ie = self._unpack_entry(e)

296

240

if not InventoryEntry.versionable_kind(kind):

297

241

raise AssertionError('unsupported entry kind %s' % kind)

298

242

299

get_cached = _get_utf8_or_ascii

243

get_cached = cache_utf8.get_cached_unicode

300

244

301

245

parent_id = elt.get('parent_id')

302

246

if parent_id is None and not none_parents:

346

290

if format != '5':

347

291

raise BzrError("invalid format version %r on inventory"

348

292

% format)

349

get_cached = _get_utf8_or_ascii

293

get_cached = cache_utf8.get_cached_unicode

350

294

rev = Revision(committer = elt.get('committer'),

351

295

timestamp = float(elt.get('timestamp')),

352

296

revision_id = get_cached(elt.get('revision_id')),

Older »