~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to bzrlib/xml_serializer.py

Committer: Patch Queue Manager
Date: 2011-12-12 14:47:03 UTC
mfrom: (6355.1.9 serializer-avoids-xml)
Revision ID: pqm@pqm.ubuntu.com-20111212144703-suptg74yxhcpon4p

(jelmer) Avoid loading XML modules when importing CHKSerializer. (Jelmer
Vernooij)

files modified:
bzrlib/chk_serializer.py

bzrlib/tests/test_import_tariff.py

bzrlib/tests/test_xml.py

bzrlib/xml5.py

bzrlib/xml8.py

bzrlib/xml_serializer.py

Show diffs side-by-side

added added

removed removed

bzrlib/xml_serializer.py

import util.elementtree as elementtree

from xml.parsers.expat import ExpatError as ParseError

from bzrlib import errors

from bzrlib import (

cache_utf8,

inventory,

lazy_regex,

errors,

)

class XMLSerializer(Serializer):

130

135

return re.subn(u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',

131

136

lambda match: match.group(0).encode('unicode_escape'),

132

137

message)

138

139

140

def get_utf8_or_ascii(a_str, _encode_utf8=cache_utf8.encode):
    """Return a cached utf-8 form of a value handed back by cElementTree.

    cElementTree returns a plain 8-bit string when the XML text is plain
    ascii and only returns Unicode when it needs to. We want to work in
    utf-8 strings, so a plain string is returned as its interned copy and a
    Unicode object is passed through ``_encode_utf8``.

    :param a_str: An 8-bit string or Unicode as returned by
        cElementTree.Element.get()
    :param _encode_utf8: Callable used to encode Unicode input; bound as a
        default argument so the lookup happens once at definition time.
    :return: A utf-8 encoded 8-bit string.
    """
    # Tuned to what cElementTree is known to produce; this is not a generic
    # helper, because an arbitrary 8-bit string need not be ascii or valid
    # utf8 and no validation is done here.
    if a_str.__class__ is not unicode:
        return intern(a_str)
    return _encode_utf8(a_str)

159

160

161

# Applied to Unicode input by encode_and_escape: matches a single XML
# metacharacter or a single code point in the range U+0080-U+FFFF.
_unicode_re = lazy_regex.lazy_compile(u'[&<>\'\"\u0080-\uffff]')
# Applied to plain 8-bit input by encode_and_escape: matches a single XML
# metacharacter, or a whole run of bytes that have the high bit set.
_utf8_re = lazy_regex.lazy_compile('[&<>\'\"]|[\x80-\xff]+')

163

164

165

# Replacement text for the five XML metacharacters matched by _utf8_re and
# _unicode_re. Each value must be the entity reference itself; mapping a
# character to its own literal form would leave the output unescaped.
_xml_escape_map = {
    "&": '&amp;',
    "'": "&apos;", # FIXME: overkill
    "\"": "&quot;",
    "<": "&lt;",
    ">": "&gt;",
    }

172

173

174

def _unicode_escape_replace(match, _map=_xml_escape_map):
    """Return the XML-safe replacement for one character matched in Unicode.

    Standard XML metacharacters (<>"' and &) are looked up in ``_map``.
    Anything else that matched is a non-ascii character, which is written
    as a decimal character reference because ElementTree did so. This
    keeps us compatible with older versions of bzr. We may change our
    policy in the future, though.

    :param match: Regex match object covering exactly one character.
    :param _map: Mapping from XML metacharacter to its escaped form.
    :return: The escaped replacement text.
    """
    found = match.group()
    # jam 20060816 Benchmarks show that try/KeyError is faster if you
    # expect the entity to rarely miss. There is about a 10% difference
    # in overall time. But if you miss frequently, then if None is much
    # faster. For our use case, we *rarely* have a revision id, file id
    # or path name that is unicode. So use try/KeyError.
    try:
        replacement = _map[found]
    except KeyError:
        replacement = "&#%d;" % ord(found)
    return replacement

191

192

193

def _utf8_escape_replace(match, _map=_xml_escape_map):
    """Return the XML-safe replacement for a match found in a utf8 string.

    Two kinds of match arrive here. A "standard" character such as "&<>
    is ascii, and its replacement is looked up directly in ``_map``.
    Everything else is a run of bytes with the high bit set: the run is
    decoded back into Unicode and every resulting character is written as
    a decimal character reference.

    :param match: Regex match object from the 8-bit pattern.
    :param _map: Mapping from XML metacharacter to its escaped form.
    :return: The escaped replacement text.
    """
    matched = match.group()
    try:
        return _map[matched]
    except KeyError:
        # Not a metacharacter, so it must be a high-bit byte run.
        pass
    references = ['&#%d;' % ord(uni_chr)
                  for uni_chr in matched.decode('utf8')]
    return ''.join(references)

206

207

208

# Memo of input string => escaped text (including the trailing quote) that
# encode_and_escape has already produced; emptied by _clear_cache().
_to_escaped_map = {}

def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
    """Encode the string into utf8, and escape invalid XML characters.

    The returned text always ends with a double quote character, so the
    caller's format string supplies only the opening quote of an
    attribute value. Results are memoised in ``_map``.

    :param unicode_or_utf8_str: Unicode object, or an 8-bit string that is
        considered to already be in utf-8.
    :param _map: Memo dictionary consulted before, and updated after,
        escaping.
    :return: The escaped text followed by '"'.
    """
    # We frequently get entities we have not seen before, so it is better
    # to check if None, rather than try/KeyError
    cached = _map.get(unicode_or_utf8_str)
    if cached is not None:
        return cached
    if unicode_or_utf8_str.__class__ is unicode:
        # The alternative policy is to do a regular UTF8 encoding
        # and then escape only XML meta characters.
        # Performance is equivalent once you use cache_utf8. *However*
        # this makes the serialized texts incompatible with old versions
        # of bzr. So no net gain. (Perhaps the read code would handle utf8
        # better than entity escapes, but cElementTree seems to do just fine
        # either way)
        escaped = str(_unicode_re.sub(_unicode_escape_replace,
                                      unicode_or_utf8_str))
    else:
        # Plain strings are considered to already be in utf-8 so we do a
        # slightly different method for escaping.
        escaped = _utf8_re.sub(_utf8_escape_replace,
                               unicode_or_utf8_str)
    result = escaped + '"'
    _map[unicode_or_utf8_str] = result
    return result

233

234

235

def _clear_cache():
    """Empty the module-level memo of already-escaped strings in place."""
    _to_escaped_map.clear()

238

239

240

def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
    """Build an inventory entry from one XML element.

    :param elt: XML element whose tag names the entry kind and whose
        attributes carry the entry data.
    :param entry_cache: Optional mapping keyed by (file_id, revision). It
        is consulted before unpacking and updated after a miss; it is only
        used when the element has a revision attribute.
    :param return_from_cache: If True, a cache hit that is not a directory
        is returned as the cached object itself rather than a copy.
    :return: The inventory entry.
    :raise AssertionError: If the tag is not a versionable kind.
    :raise UnsupportedInventoryKind: If the kind is versionable but is not
        one of 'directory', 'file' or 'symlink'.
    """
    attr = elt.get
    file_id = attr('file_id')
    revision = attr('revision')
    # Check and see if we have already unpacked this exact entry
    # Some timings for "repo.revision_trees(last_100_revs)"
    #                   bzr     mysql
    #   unmodified      4.1s    40.8s
    #   using lru       3.5s
    #   using fifo      2.83s   29.1s
    #   lru._cache      2.8s
    #   dict            2.75s   26.8s
    #   inv.add         2.5s    26.0s
    #   no_copy         2.00s   20.5s
    #   no_c,dict       1.95s   18.0s
    # Note that a cache of 10k nodes is more than sufficient to hold all of
    # the inventory for the last 100 revs for bzr, but not for mysql (20k
    # is enough for mysql, which saves the same 2s as using a dict)

    # Breakdown of mysql using time.clock()
    #   4.1s    2 calls to element.get for file_id, revision_id
    #   4.5s    cache_hit lookup
    #   7.1s    InventoryFile.copy()
    #   2.4s    InventoryDirectory.copy()
    #   0.4s    decoding unique entries
    #   1.6s    decoding entries after FIFO fills up
    #   0.8s    Adding nodes to FIFO (including flushes)
    #   0.1s    cache miss lookups
    # Using an LRU cache
    #   4.1s    2 calls to element.get for file_id, revision_id
    #   9.9s    cache_hit lookup
    #   10.8s   InventoryEntry.copy()
    #   0.3s    cache miss lookups
    #   1.2s    decoding entries
    #   1.0s    adding nodes to LRU
    use_cache = entry_cache is not None and revision is not None
    if use_cache:
        # The key is built from the raw attribute values, before they are
        # normalised below.
        key = (file_id, revision)
        try:
            cached_ie = entry_cache[key]
        except KeyError:
            pass
        else:
            # Only copying directory entries drops us 2.85s => 2.35s
            if return_from_cache and cached_ie.kind != 'directory':
                return cached_ie
            # We copy it, because some operations may mutate it
            return cached_ie.copy()

    kind = elt.tag
    if not inventory.InventoryEntry.versionable_kind(kind):
        raise AssertionError('unsupported entry kind %s' % kind)

    file_id = get_utf8_or_ascii(file_id)
    if revision is not None:
        revision = get_utf8_or_ascii(revision)
    parent_id = attr('parent_id')
    if parent_id is not None:
        parent_id = get_utf8_or_ascii(parent_id)

    if kind == 'directory':
        ie = inventory.InventoryDirectory(file_id, attr('name'), parent_id)
    elif kind == 'file':
        ie = inventory.InventoryFile(file_id, attr('name'), parent_id)
        ie.text_sha1 = attr('text_sha1')
        if attr('executable') == 'yes':
            ie.executable = True
        size_text = attr('text_size')
        # A missing (or empty) attribute is stored as-is, not converted.
        ie.text_size = size_text and int(size_text)
    elif kind == 'symlink':
        ie = inventory.InventoryLink(file_id, attr('name'), parent_id)
        ie.symlink_target = attr('symlink_target')
    else:
        raise errors.UnsupportedInventoryKind(kind)
    ie.revision = revision
    if use_cache:
        # We cache a copy() because callers like to mutate objects, and
        # that would cause the item in cache to mutate as well.
        # This has a small effect on many-inventory performance, because
        # the majority fraction is spent in cache hits, not misses.
        entry_cache[key] = ie.copy()

    return ie

330

331

332

def unpack_inventory_flat(elt, format_num, unpack_entry,
                          entry_cache=None, return_from_cache=False):
    """Unpack a flat XML inventory.

    :param elt: XML element for the inventory
    :param format_num: Expected format number
    :param unpack_entry: Function for unpacking inventory entries; called
        as unpack_entry(child, entry_cache, return_from_cache).
    :param entry_cache: Passed through unchanged to ``unpack_entry``.
    :param return_from_cache: Passed through unchanged to ``unpack_entry``.
    :return: An inventory
    :raise UnexpectedInventoryFormat: When unexpected elements or data is
        encountered
    """
    root_tag = elt.tag
    if root_tag != 'inventory':
        raise errors.UnexpectedInventoryFormat('Root tag is %r' % root_tag)
    found_format = elt.get('format')
    if found_format != format_num:
        raise errors.UnexpectedInventoryFormat(
            'Invalid format version %r' % found_format)
    revision_id = elt.get('revision_id')
    if revision_id is not None:
        revision_id = cache_utf8.encode(revision_id)
    inv = inventory.Inventory(root_id=None, revision_id=revision_id)
    for child in elt:
        inv.add(unpack_entry(child, entry_cache, return_from_cache))
    return inv

357

358

359

def serialize_inventory_flat(inv, append, root_id, supported_kinds, working):
    """Serialize an inventory to a flat XML file.

    The first entry yielded by ``inv.iter_entries()`` (the root) is skipped,
    one element is written per remaining entry, and the closing
    ``</inventory>`` tag is written last. The opening tag is not written
    here.

    :param inv: Inventory to serialize
    :param append: Function for writing a line of output
    :param root_id: Id compared against each entry's parent_id; entries
        whose parent_id equals it are written without a parent_id
        attribute.
    :param supported_kinds: Container of kind names; only consulted to
        decide whether 'tree-reference' entries may be written.
    :param working: If True skip history data - revision, text_sha1,
        text_size, reference_revision, symlink_target.
    :raise UnsupportedInventoryKind: For a 'tree-reference' entry when that
        kind is not in ``supported_kinds``, or for any kind other than
        file, directory, symlink and tree-reference.
    """
    entries = inv.iter_entries()
    # Skip the root
    next(entries)
    for path, ie in entries:
        # encode_and_escape() returns its text with the closing '"' already
        # attached, which is why the format strings below only ever contain
        # the opening quote of those attributes.
        if ie.parent_id != root_id:
            parent_str = ' parent_id="'
            parent_id = encode_and_escape(ie.parent_id)
        else:
            parent_str = ''
            parent_id = ''
        if ie.kind == 'file':
            if ie.executable:
                executable = ' executable="yes"'
            else:
                executable = ''
            if not working:
                append('<file%s file_id="%s name="%s%s%s revision="%s '
                    'text_sha1="%s" text_size="%d" />\n' % (
                    executable, encode_and_escape(ie.file_id),
                    encode_and_escape(ie.name), parent_str, parent_id,
                    encode_and_escape(ie.revision), ie.text_sha1,
                    ie.text_size))
            else:
                append('<file%s file_id="%s name="%s%s%s />\n' % (
                    executable, encode_and_escape(ie.file_id),
                    encode_and_escape(ie.name), parent_str, parent_id))
        elif ie.kind == 'directory':
            if not working:
                append('<directory file_id="%s name="%s%s%s revision="%s '
                    '/>\n' % (
                    encode_and_escape(ie.file_id),
                    encode_and_escape(ie.name),
                    parent_str, parent_id,
                    encode_and_escape(ie.revision)))
            else:
                append('<directory file_id="%s name="%s%s%s />\n' % (
                    encode_and_escape(ie.file_id),
                    encode_and_escape(ie.name),
                    parent_str, parent_id))
        elif ie.kind == 'symlink':
            if not working:
                append('<symlink file_id="%s name="%s%s%s revision="%s '
                    'symlink_target="%s />\n' % (
                    encode_and_escape(ie.file_id),
                    encode_and_escape(ie.name),
                    parent_str, parent_id,
                    encode_and_escape(ie.revision),
                    encode_and_escape(ie.symlink_target)))
            else:
                append('<symlink file_id="%s name="%s%s%s />\n' % (
                    encode_and_escape(ie.file_id),
                    encode_and_escape(ie.name),
                    parent_str, parent_id))
        elif ie.kind == 'tree-reference':
            if ie.kind not in supported_kinds:
                raise errors.UnsupportedInventoryKind(ie.kind)
            if not working:
                append('<tree-reference file_id="%s name="%s%s%s '
                    'revision="%s reference_revision="%s />\n' % (
                    encode_and_escape(ie.file_id),
                    encode_and_escape(ie.name),
                    parent_str, parent_id,
                    encode_and_escape(ie.revision),
                    encode_and_escape(ie.reference_revision)))
            else:
                append('<tree-reference file_id="%s name="%s%s%s />\n' % (
                    encode_and_escape(ie.file_id),
                    encode_and_escape(ie.name),
                    parent_str, parent_id))
        else:
            raise errors.UnsupportedInventoryKind(ie.kind)
    append('</inventory>\n')

Older »