~bzr-pqm/bzr/bzr.dev

« back to all changes in this revision

Viewing changes to elementtree/ElementTree.py

Committer: mbp at sourcefrog
Date: 2005-03-23 06:25:55 UTC
Revision ID: mbp@sourcefrog.net-20050323062555-5489339018d0c043

- import a subset of elementtree for easier installation

files added:
elementtree

elementtree/ElementTree.py

elementtree/__init__.py

files modified:
NEWS

Show diffs side-by-side

added added

removed removed

elementtree/ElementTree.py

# ElementTree

# $Id: ElementTree.py 1862 2004-06-18 07:31:02Z Fredrik $

# light-weight XML support for Python 1.5.2 and later.

# this is a stripped-down version of Secret Labs' effDOM library (part

# of xmlToolkit). compared to effDOM, this implementation has:

# - no support for observers

# - no html-specific extensions (e.g. entity preload)

# - no custom entities, doctypes, etc

# - no accelerator module

# history:

# 2001-10-20 fl created (from various sources)

# 2001-11-01 fl return root from parse method

# 2002-02-16 fl sort attributes in lexical order

# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup

# 2002-05-01 fl finished TreeBuilder refactoring

# 2002-07-14 fl added basic namespace support to ElementTree.write

# 2002-07-25 fl added QName attribute support

# 2002-10-20 fl fixed encoding in write

# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding

# 2002-11-27 fl accept file objects or file names for parse/write

# 2002-12-04 fl moved XMLTreeBuilder back to this module

# 2003-01-11 fl fixed entity encoding glitch for us-ascii

# 2003-02-13 fl added XML literal factory

# 2003-02-21 fl added ProcessingInstruction/PI factory

# 2003-05-11 fl added tostring/fromstring helpers

# 2003-05-26 fl added ElementPath support

# 2003-07-05 fl added makeelement factory method

# 2003-07-28 fl added more well-known namespace prefixes

# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)

# 2003-09-04 fl fall back on emulator if ElementPath is not installed

# 2003-10-31 fl markup updates

# 2003-11-15 fl fixed nested namespace bug

# 2004-03-28 fl added XMLID helper

# 2004-06-02 fl added default support to findtext

# 2004-06-08 fl fixed encoding of non-ascii element/attribute names

# fredrik@pythonware.com

# http://www.pythonware.com

# --------------------------------------------------------------------

# The ElementTree toolkit is

# By obtaining, using, and/or copying this software and/or its

# associated documentation, you agree that you have read, understood,

# and will comply with the following terms and conditions:

# Permission to use, copy, modify, and distribute this software and

# its associated documentation for any purpose and without fee is

# hereby granted, provided that the above copyright notice appears in

# all copies, and that both that copyright notice and this permission

# notice appear in supporting documentation, and that the name of

# Secret Labs AB or the author not be used in advertising or publicity

# pertaining to distribution of the software without specific, written

# prior permission.

# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

# OF THIS SOFTWARE.

# --------------------------------------------------------------------

# Names exported by "from ElementTree import *".
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring",
    "iselement",
    "parse",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION", "XML",
    "XMLTreeBuilder",
    ]

# The Element type is a flexible container object, designed to

# store hierarchical data structures in memory. The type can be

# described as a cross between a list and a dictionary.

#

# Each element has a number of properties associated with it:

# <ul>

# <li>a tag. This is a string identifying what kind of data

100

# this element represents (the element type, in other words).</li>

101

# <li>a number of attributes, stored in a Python dictionary.</li>

102

# <li>a text string.</li>

103

# <li>an optional tail string.</li>

104

# <li>a number of child elements, stored in a Python sequence</li>

105

# </ul>

106

107

# To create an element instance, use the {@link #Element} or {@link

108

# #SubElement} factory functions.

109

#

110

# The {@link #ElementTree} class can be used to wrap an element

111

# structure, and convert it from and to XML.

112

113

114

import string, sys, re

115

116

class _SimpleElementPath:
    # Minimal stand-in for the ElementPath module: emulates the pre-1.2
    # find/findtext/findall behaviour, i.e. plain tag matching against
    # the direct children of an element.

    def find(self, element, tag):
        # Return the first direct child whose tag equals 'tag', or None.
        for child in element:
            if child.tag != tag:
                continue
            return child
        return None

    def findtext(self, element, tag, default=None):
        # Return the text of the first matching child ("" when that
        # child has no text), or 'default' when nothing matches.
        match = self.find(element, tag)
        if match is None:
            return default
        return match.text or ""

    def findall(self, element, tag):
        # A leading ".//" searches the whole subtree through
        # getiterator; anything else matches direct children only.
        if tag[:3] == ".//":
            return element.getiterator(tag[3:])
        matches = []
        for child in element:
            if child.tag == tag:
                matches.append(child)
        return matches

136

137

# Use the real ElementPath module when it can be imported; otherwise
# bind the name to the simple tag-matching emulation.
try:
    import ElementPath
except ImportError:
    # FIXME: issue warning in this case?
    ElementPath = _SimpleElementPath()

142

143

# TODO: add support for custom namespace resolvers/default namespaces

144

# TODO: add improved support for incremental parsing

145

146

# Version string of this module (listed in __all__).
VERSION = "1.2"

147

148

149

# Internal element class. This class defines the Element interface,

150

# and provides a reference implementation of this interface.

151

#

152

# You should not create instances of this class directly. Use the

153

# appropriate factory functions instead, such as {@link #Element}

154

# and {@link #SubElement}.

155

156

# @see Element

157

# @see SubElement

158

# @see Comment

159

# @see ProcessingInstruction

160

161

class _ElementInterface:
    # Serialized shape of an element:
    #   <tag attrib>text<child/>...</tag>tail

    # (Attribute) Element tag.
    tag = None

    # (Attribute) Element attribute dictionary.  Where possible, use
    # the get, set, keys and items methods to access element
    # attributes.
    attrib = None

    # (Attribute) Text before the first subelement.  Either a string,
    # or None if there was no text.
    text = None

    # (Attribute) Text after this element's end tag, but before the
    # next sibling's start tag.  Either a string, or None.
    tail = None

    def __init__(self, tag, attrib):
        # The attribute dictionary is stored as given, not copied.
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (self.tag, id(self))

    # Creates a new element through the module-level Element factory.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        return Element(tag, attrib)

    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    # Returns the given subelement.
    #
    # @param index What subelement to return.
    # @return The given subelement.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    # Replaces the given subelement.
    #
    # @param index What subelement to replace.
    # @param element The new element value.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element is not a valid object.

    def __setitem__(self, index, element):
        assert iselement(element)
        self._children[index] = element

    # Deletes the given subelement.
    #
    # @param index What subelement to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    # Returns a list containing subelements in the given range.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    # Replaces a number of subelements with elements from a sequence.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence object with zero or more elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        # validate every member before touching the child list
        for candidate in elements:
            assert iselement(candidate)
        self._children[start:stop] = list(elements)

    # Deletes a number of subelements.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        del self._children[start:stop]

    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If the element is not a valid object.

    def append(self, element):
        assert iselement(element)
        self._children.append(element)

    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        self._children.insert(index, element)

    # Removes a matching subelement.  Matching is delegated to
    # list.remove on the child list; it is not based on tag value or
    # contents the way the find methods are.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        self._children.remove(element)

    # Returns all subelements, in document order.  The list returned
    # is the element's own child list, not a copy.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        return self._children

    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #     in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    # Resets an element.  Removes all subelements, clears all
    # attributes, and sets the text and tail attributes to None.

    def clear(self):
        self.attrib.clear()
        self._children = []
        self.text = None
        self.tail = None

    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    # Gets the attribute names, in arbitrary order (just like for an
    # ordinary Python dictionary).
    #
    # @return A list of element attribute names.
    # @defreturn list of strings

    def keys(self):
        return self.attrib.keys()

    # Gets element attributes as a sequence, in arbitrary order.
    #
    # @return A list of (name, value) tuples for all attributes.
    # @defreturn list of (string, string) tuples

    def items(self):
        return self.attrib.items()

    # Collects this element and all subelements, in document order,
    # keeping only the elements with a matching tag.
    #
    # If the tree structure is modified during iteration, the result
    # is undefined.
    #
    # @param tag What tags to look for (default is to return all
    #     elements; "*" means the same).
    # @return A list or iterator containing all the matching elements.
    # @defreturn list or iterator

    def getiterator(self, tag=None):
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            matches = [self]
        else:
            matches = []
        # each child contributes its own subtree, in order
        for child in self._children:
            matches.extend(child.getiterator(tag))
        return matches

430

431

# compatibility

432

# older name for the element class, bound to the same class object
_Element = _ElementInterface

433

434

435

# Element factory. This function returns an object implementing the

436

# standard Element interface. The exact class or type of that object

437

# is implementation dependent, but it will always be compatible with

438

# the {@link #_ElementInterface} class in this module.

439

#

440

# The element name, attribute names, and attribute values can be

441

# either 8-bit ASCII strings or Unicode strings.

442

443

# @param tag The element name.

444

# @param attrib An optional dictionary, containing element attributes.

445

# @param **extra Additional attributes, given as keyword arguments.

446

# @return An element instance.

447

# @defreturn Element

448

449

def Element(tag, attrib={}, **extra):
    # Build the element from a private copy of the attribute
    # dictionary, so neither the caller's dictionary nor the shared
    # default is modified when the keyword attributes are merged in.
    attributes = attrib.copy()
    attributes.update(extra)
    return _ElementInterface(tag, attributes)

453

454

455

# Subelement factory. This function creates an element instance, and

456

# appends it to an existing element.

457

#

458

# The element name, attribute names, and attribute values can be

459

# either 8-bit ASCII strings or Unicode strings.

460

461

# @param parent The parent element.

462

# @param tag The subelement name.

463

# @param attrib An optional dictionary, containing element attributes.

464

# @param **extra Additional attributes, given as keyword arguments.

465

# @return An element instance.

466

# @defreturn Element

467

468

def SubElement(parent, tag, attrib={}, **extra):
    # Merge the keyword attributes into a private copy, let the parent
    # create the child through makeelement, then attach the child to
    # the end of the parent.
    attributes = attrib.copy()
    attributes.update(extra)
    child = parent.makeelement(tag, attributes)
    parent.append(child)
    return child

474

475

476

# Comment element factory. This factory function creates a special

477

# element that will be serialized as an XML comment.

478

#

479

# The comment string can be either an 8-bit ASCII string or a Unicode

480

# string.

481

482

# @param text A string containing the comment string.

483

# @return An element instance, representing a comment.

484

# @defreturn Element

485

486

def Comment(text=None):
    # The Comment function object itself is used as the tag; the
    # serializer recognises comments with an identity test on it.
    node = Element(Comment)
    node.text = text
    return node

490

491

492

# PI element factory. This factory function creates a special element

493

# that will be serialized as an XML processing instruction.

494

495

# @param target A string containing the PI target.

496

# @param text A string containing the PI contents, if any.

497

# @return An element instance, representing a PI.

498

# @defreturn Element

499

500

def ProcessingInstruction(target, text=None):
    # The ProcessingInstruction function object itself is used as the
    # tag.  The element text is the target, followed by a space and
    # the contents when contents are given.
    node = Element(ProcessingInstruction)
    if text:
        node.text = target + " " + text
    else:
        node.text = target
    return node

506

507

# short alias for the ProcessingInstruction factory
PI = ProcessingInstruction

508

509

510

# QName wrapper. This can be used to wrap a QName attribute value, in

511

# order to get proper namespace handling on output.

512

513

# @param text A string containing the QName value, in the form {uri}local,

514

# or, if the tag argument is given, the URI part of a QName.

515

# @param tag Optional tag. If given, the first argument is interpreted as

516

# an URI, and this argument is interpreted as a local name.

517

# @return An opaque object, representing the QName.

518

519

class QName:
    # Wrapper around a qualified name held as "{uri}local" text.

    def __init__(self, text_or_uri, tag=None):
        # With a tag, the first argument is the namespace URI and the
        # two are combined; without one it is already the full text.
        if tag:
            self.text = "{%s}%s" % (text_or_uri, tag)
        else:
            self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        # hash like the wrapped text
        return hash(self.text)

    def __cmp__(self, other):
        # compare by text, against another QName or a plain value
        if isinstance(other, QName):
            other = other.text
        return cmp(self.text, other)

532

533

534

# ElementTree wrapper class. This class represents an entire element

535

# hierarchy, and adds some extra support for serialization to and from

536

# standard XML.

537

538

# @param element Optional root element.

539

# @keyparam file Optional file handle or name. If given, the

540

# tree is initialized with the contents of this XML file.

541

542

class ElementTree:
    # Wraps a root element and adds parsing from, and serialization
    # to, XML files.

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    # Loads an external XML document into this element tree.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard XMLTreeBuilder parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        # Only a file opened here is closed here; a file object handed
        # in by the caller stays open.
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if opened_here:
                source.close()

    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            # make an absolute path relative to the root element
            path = "." + path
        return self._root.find(path)

    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #     in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    # Writes the element tree to a file, as XML.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        # Only a file opened here is closed here, so the data is
        # flushed to disk; a caller-supplied file object stays open.
        opened_here = not hasattr(file, "write")
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                # an XML declaration is only written for other encodings
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            if opened_here:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        # write XML to file
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            items = node.items()
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            # "or node" is true when the element has children (__len__)
            if node.text or node:
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                file.write(" />")
            # declarations made on this element go out of scope with it
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))

717

718

# --------------------------------------------------------------------

719

# helpers

720

721

722

# Checks if an object appears to be a valid element object.

723

724

# @param element An element instance.

725

# @return A true value if this is an element object.

726

# @defreturn flag

727

728

def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    #
    # Instances of the element class qualify, and so does any other
    # object that has a "tag" attribute.
    is_interface = isinstance(element, _ElementInterface)
    if is_interface:
        return is_interface
    return hasattr(element, "tag")

732

733

734

# Writes an element tree or element structure to sys.stdout. This

735

# function should be used for debugging only.

736

#

737

# The exact output format is implementation dependent. In this

738

# version, it's written as an ordinary XML file.

739

740

# @param elem An element tree or an individual element.

741

742

def dump(elem):
    # debugging aid: write the tree (or a bare element, wrapped in a
    # tree first) to sys.stdout
    tree = elem
    if not isinstance(tree, ElementTree):
        tree = ElementTree(tree)
    tree.write(sys.stdout)
    # finish the output with a newline unless the root's tail already
    # ends with one
    trailing = tree.getroot().tail
    if trailing and trailing[-1] == "\n":
        return
    sys.stdout.write("\n")

750

751

def _encode(s, encoding):
    # Encode through the object's own encode method; objects without
    # one are returned unchanged.
    try:
        encoded = s.encode(encoding)
    except AttributeError:
        return s # 1.5.2: assume the string uses the right encoding
    return encoded

756

757

# Compiled pattern matching runs of characters that _encode_entity
# rewrites: & < > " and everything above 7-bit ASCII.
if sys.version[:3] == "1.5":
    _pattern = r"[&<>\"\x80-\xff]+" # 1.5.2
else:
    # NOTE(review): presumably built through eval so the unicode
    # literal is never seen by the 1.5.2 compiler -- confirm
    _pattern = eval(r'u"[&<>\"\u0080-\uffff]+"')
_escape = re.compile(_pattern)
del _pattern

761

762

# Named entities for the reserved characters; _encode_entity writes
# any other matched character as a numeric character reference.
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}

768

769

# Preferred prefixes for "well-known" namespace URIs; fixtag consults
# this table before it invents an "ns<N>" prefix.
_namespace_map = {
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}

776

777

def _raise_serialization_error(text):
    # Always raises TypeError, naming the offending value and its type.
    type_name = type(text).__name__
    message = "cannot serialize %r (type %s)" % (text, type_name)
    raise TypeError(message)

781

782

def _encode_entity(text, pattern=_escape):
    # map reserved and non-ascii characters to numerical entities
    def escape_entities(m, map=_escape_map):
        # translate one matched run, character by character: mapped
        # characters use their table entry, all others become &#N;
        pieces = []
        for char in m.group():
            replacement = map.get(char)
            if replacement is None:
                replacement = "&#%d;" % ord(char)
            pieces.append(replacement)
        return string.join(pieces, "")
    try:
        escaped = pattern.sub(escape_entities, text)
        return _encode(escaped, "ascii")
    except TypeError:
        _raise_serialization_error(text)

797

798

799

# the following functions assume an ascii-compatible encoding

800

# (or "utf-16")

801

802

def _escape_cdata(text, encoding=None, replace=string.replace):
    # Escape character data for use as element content.
    #
    # The text is first encoded (when an encoding is given).  If the
    # encoding cannot represent it, the text is returned with reserved
    # and non-ascii characters written as entities instead.  Otherwise
    # the markup characters & < > are replaced by their predefined
    # entities.  Values that are not strings raise TypeError through
    # _raise_serialization_error.
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        # "&" must go first, so the entities added below are not
        # escaped a second time
        text = replace(text, "&", "&amp;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

816

817

def _escape_attrib(text, encoding=None, replace=string.replace):
    # Escape a value for use inside a double-quoted attribute.
    #
    # The text is first encoded (when an encoding is given).  If the
    # encoding cannot represent it, the text is returned with reserved
    # and non-ascii characters written as entities instead.  Otherwise
    # & ' " < > are replaced by their predefined entities.  Values
    # that are not strings raise TypeError through
    # _raise_serialization_error.
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        # "&" must go first, so the entities added below are not
        # escaped a second time
        text = replace(text, "&", "&amp;")
        text = replace(text, "'", "&apos;") # FIXME: overkill
        text = replace(text, "\"", "&quot;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

833

834

def fixtag(tag, namespaces):
    # Given a decorated tag of the form {uri}tag (or a QName holding
    # one), return the prefixed tag "prefix:tag" together with the
    # namespace declaration to emit, if any.
    #
    # 'namespaces' maps URIs to the prefixes currently in scope and is
    # updated in place when a new prefix is assigned.  The declaration
    # is an ("xmlns:prefix", uri) pair, or None when the prefix is
    # already in scope or is the "xml" prefix.
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, local = string.split(tag[1:], "}", 1)
    declaration = None
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        # not in scope yet: use the well-known prefix, or make one up
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        if prefix != "xml":
            declaration = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, local), declaration

853

854

855

# Parses an XML document into an element tree.

856

857

# @param source A filename or file object containing XML data.

858

# @param parser An optional parser instance. If not given, the

859

# standard {@link XMLTreeBuilder} parser is used.

860

# @return An ElementTree instance

861

862

def parse(source, parser=None):
    # Load 'source' into a fresh ElementTree and return the tree
    # itself (not its root element).
    document = ElementTree()
    document.parse(source, parser)
    return document

866

867

868

# Parses an XML document from a string constant. This function can

869

# be used to embed "XML literals" in Python code.

870

871

# @param text A string containing XML data.

872

# @return An Element instance.

873

# @defreturn Element

874

875

def XML(text):
    # Feed the whole string to a fresh XMLTreeBuilder and return
    # whatever its close method returns.
    builder = XMLTreeBuilder()
    builder.feed(text)
    return builder.close()

879

880

881

# Parses an XML document from a string constant, and also returns

882

# a dictionary which maps from element id:s to elements.

883

884

# @param text A string containing XML data.

885

# @return A tuple containing an Element instance and a dictionary.

886

# @defreturn (Element, dictionary)

887

888

def XMLID(text):
    # Parse the string, then index every element that has a non-empty
    # "id" attribute.  When an id occurs more than once, the element
    # visited last by getiterator is the one kept.
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if key:
            by_id[key] = node
    return root, by_id

898

899

900

# Parses an XML document from a string constant. Same as {@link #XML}.

901

902

# @def fromstring(text)

903

# @param text A string containing XML data.

904

# @return An Element instance.

905

# @defreturn Element

906

907

# fromstring is the same function object as XML
fromstring = XML

908

909

910

# Generates a string representation of an XML element, including all

911

# subelements.

912

913

# @param element An Element instance.

914

# @return An encoded string containing the XML data.

915

# @defreturn string

916

917

def tostring(element, encoding=None):
    # Serialize through ElementTree.write into an in-memory sink: a
    # bare object whose write attribute appends to a list.  The
    # collected pieces are joined into one string at the end.
    class dummy:
        pass
    chunks = []
    sink = dummy()
    sink.write = chunks.append
    ElementTree(element).write(sink, encoding)
    return string.join(chunks, "")

925

926

927

# Generic element structure builder. This builder converts a sequence

928

# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link

929

# #TreeBuilder.end} method calls to a well-formed element structure.

930

#

931

# You can use this class to build an element structure using a custom XML

932

# parser, or a parser for some other XML-like format.

933

934

# @param element_factory Optional element factory. This factory

935

# is called to create new Element instances, as necessary.

936

937

class TreeBuilder:
    # Builds an element structure from a sequence of start, data and
    # end calls.

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element
    # @exception AssertionError If elements are still open, or no
    #     element was ever started.

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # identity test: "!= None" would go through the element's own
        # comparison methods, which a custom factory class may define
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach the collected character data to the last element: as
        # its tail after an end tag, as its text after a start tag.
        # Data seen before any element exists is dropped.
        if self._data:
            if self._last is not None:
                text = string.join(self._data, "")
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #     containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # nest under the element that is currently open
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element
    # @exception AssertionError If the tag does not match the element
    #     being closed.

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last

1013

1014

1015

# Element structure builder for XML source data, based on the

1016

# expat parser.

1017

1018

# @keyparam target Target object. If omitted, the builder uses an

1019

# instance of the standard {@link #TreeBuilder} class.

1020

# @keyparam html Predefine HTML entities. This flag is not supported

1021

# by the current implementation.

1022

# @see #ElementTree

1023

# @see #TreeBuilder

1024

1025

class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        # html is accepted for interface compatibility but is not used
        from xml.parsers import expat
        # "}" is the namespace separator, so expat reports qualified
        # names as "uri}local"; _fixname adds the leading "{"
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        parser.DefaultHandler = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        self._doctype = None # list of doctype tokens while inside <!DOCTYPE
        self.entity = {} # entity name -> replacement text, filled by the user

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return str(text) # what if the default encoding is changed?
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    # expat start-element callback: normalizes the tag and attribute
    # names/values, then forwards to the target

    def _start(self, tag, attrib_in):
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    # expat character-data callback

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    # expat end-element callback

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    # expat default callback: resolves entity references through
    # self.entity and collects the tokens of a doctype declaration

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities; only the lookup is guarded,
            # so a KeyError raised by the target is not misreported
            try:
                value = self.entity[text[1:-1]]
            except KeyError:
                from xml.parsers import expat
                lineno = self._parser.ErrorLineNumber
                offset = self._parser.ErrorColumnNumber
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, lineno, offset)
                    )
                # same attributes as the errors expat raises itself
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = lineno
                err.offset = offset
                raise err
            self._target.data(value)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # identifiers arrive quoted; drop the surrounding quotes
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.  The parser and target
    # references are dropped even if parsing or the target's close
    # fails.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        try:
            self._parser.Parse("", 1) # end of data
            tree = self._target.close()
        finally:
            del self._target, self._parser # get rid of circular references
        return tree

Older »